Repository: huggingface/text-generation-inference
Branch: main
Commit: db931fcf42da
Files: 864
Total size: 6.3 MB

Directory structure:
gitextract_uvlvpncm/

├── .dockerignore
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug-report.yml
│   │   ├── config.yml
│   │   ├── feature-request.yml
│   │   └── new-model-addition.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── autodocs.yaml
│       ├── build.yaml
│       ├── build_documentation.yaml
│       ├── build_pr_documentation.yaml
│       ├── ci_build.yaml
│       ├── client-tests.yaml
│       ├── codeql.yml
│       ├── integration_tests.yaml
│       ├── load_test.yaml
│       ├── nix_build.yaml
│       ├── nix_cache.yaml
│       ├── nix_tests.yaml
│       ├── stale.yaml
│       ├── tests.yaml
│       ├── trufflehog.yaml
│       └── upload_pr_documentation.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .redocly.lint-ignore.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Cargo.toml
├── Dockerfile
├── Dockerfile.neuron
├── Dockerfile.nix
├── Dockerfile_amd
├── Dockerfile_gaudi
├── Dockerfile_intel
├── Dockerfile_llamacpp
├── Dockerfile_trtllm
├── LICENSE
├── Makefile
├── README.md
├── assets/
│   └── tgi_grafana.json
├── backends/
│   ├── client/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   └── src/
│   │       ├── lib.rs
│   │       ├── v2/
│   │       │   ├── client.rs
│   │       │   ├── mod.rs
│   │       │   └── sharded_client.rs
│   │       └── v3/
│   │           ├── client.rs
│   │           ├── mod.rs
│   │           └── sharded_client.rs
│   ├── gaudi/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── examples/
│   │   │   └── docker_commands/
│   │   │       └── docker_commands.md
│   │   ├── server/
│   │   │   ├── .gitignore
│   │   │   ├── Makefile
│   │   │   ├── Makefile-awq
│   │   │   ├── Makefile-eetq
│   │   │   ├── Makefile-fbgemm
│   │   │   ├── Makefile-flash-att
│   │   │   ├── Makefile-flash-att-v2
│   │   │   ├── Makefile-selective-scan
│   │   │   ├── Makefile-vllm
│   │   │   ├── README.md
│   │   │   ├── dill-0.3.7-patch.sh
│   │   │   ├── dill-0.3.8-patch.sh
│   │   │   ├── pyproject.toml
│   │   │   ├── requirements.txt
│   │   │   └── text_generation_server/
│   │   │       ├── __init__.py
│   │   │       ├── adapters/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── config.py
│   │   │       │   ├── lora.py
│   │   │       │   └── weights.py
│   │   │       ├── cache.py
│   │   │       ├── cli.py
│   │   │       ├── interceptor.py
│   │   │       ├── layers/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── attention/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── common.py
│   │   │       │   │   ├── hpu.py
│   │   │       │   │   └── kv_cache.py
│   │   │       │   ├── awq/
│   │   │       │   │   ├── conversion_utils.py
│   │   │       │   │   └── quantize/
│   │   │       │   │       ├── __init__.py
│   │   │       │   │       └── hpu.py
│   │   │       │   ├── bnb.py
│   │   │       │   ├── compressed_tensors/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── loader.py
│   │   │       │   │   └── w8an_fp.py
│   │   │       │   ├── conv.py
│   │   │       │   ├── exl2.py
│   │   │       │   ├── fp8.py
│   │   │       │   ├── gptq/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── hpu.py
│   │   │       │   │   ├── quantize.py
│   │   │       │   │   └── utils.py
│   │   │       │   ├── layernorm.py
│   │   │       │   ├── linear.py
│   │   │       │   ├── lora.py
│   │   │       │   ├── medusa.py
│   │   │       │   ├── mlp.py
│   │   │       │   ├── moe/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── fp8.py
│   │   │       │   │   ├── fused_moe.py
│   │   │       │   │   └── unquantized.py
│   │   │       │   ├── rotary.py
│   │   │       │   ├── speculative.py
│   │   │       │   └── tensor_parallel.py
│   │   │       ├── models/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── custom_modeling/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── bloom_modeling.py
│   │   │       │   │   ├── clip.py
│   │   │       │   │   ├── flash_cohere_modeling.py
│   │   │       │   │   ├── flash_dbrx_modeling.py
│   │   │       │   │   ├── flash_deepseek_v2_modeling.py
│   │   │       │   │   ├── flash_deepseek_v3_modeling.py
│   │   │       │   │   ├── flash_gemma2_modeling.py
│   │   │       │   │   ├── flash_gemma3_modeling.py
│   │   │       │   │   ├── flash_gemma_modeling.py
│   │   │       │   │   ├── flash_gpt2_modeling.py
│   │   │       │   │   ├── flash_gptj_modeling.py
│   │   │       │   │   ├── flash_llama4_modeling.py
│   │   │       │   │   ├── flash_llama_modeling.py
│   │   │       │   │   ├── flash_llava_next.py
│   │   │       │   │   ├── flash_mistral_modeling.py
│   │   │       │   │   ├── flash_mixtral_modeling.py
│   │   │       │   │   ├── flash_mllama.py
│   │   │       │   │   ├── flash_neox_modeling.py
│   │   │       │   │   ├── flash_pali_gemma_modeling.py
│   │   │       │   │   ├── flash_phi_modeling.py
│   │   │       │   │   ├── flash_phi_moe_modeling.py
│   │   │       │   │   ├── flash_qwen2_modeling.py
│   │   │       │   │   ├── flash_qwen3_modeling.py
│   │   │       │   │   ├── flash_qwen3_moe_modeling.py
│   │   │       │   │   ├── flash_rw_modeling.py
│   │   │       │   │   ├── flash_santacoder_modeling.py
│   │   │       │   │   ├── flash_starcoder2_modeling.py
│   │   │       │   │   ├── idefics2.py
│   │   │       │   │   ├── idefics3.py
│   │   │       │   │   ├── mamba_modeling.py
│   │   │       │   │   ├── qwen2_5_vl.py
│   │   │       │   │   ├── qwen2_vl.py
│   │   │       │   │   ├── siglip.py
│   │   │       │   │   └── vlm.py
│   │   │       │   ├── flash_causal_lm.py
│   │   │       │   ├── flash_vlm_causal_lm.py
│   │   │       │   ├── globals.py
│   │   │       │   ├── mllama_causal_lm.py
│   │   │       │   ├── model.py
│   │   │       │   ├── seq2seq_lm.py
│   │   │       │   └── types.py
│   │   │       ├── pb/
│   │   │       │   └── .gitignore
│   │   │       ├── server.py
│   │   │       ├── tracing.py
│   │   │       └── utils/
│   │   │           ├── __init__.py
│   │   │           ├── adapter.py
│   │   │           ├── chunks.py
│   │   │           ├── convert.py
│   │   │           ├── debug.py
│   │   │           ├── dist.py
│   │   │           ├── hub.py
│   │   │           ├── import_utils.py
│   │   │           ├── kernels.py
│   │   │           ├── log.py
│   │   │           ├── logits_process.py
│   │   │           ├── merges/
│   │   │           │   ├── strategies.py
│   │   │           │   └── utils.py
│   │   │           ├── peft.py
│   │   │           ├── prefill_chunking.py
│   │   │           ├── quantization.py
│   │   │           ├── segments.py
│   │   │           ├── sgmv.py
│   │   │           ├── speculate.py
│   │   │           ├── tokens.py
│   │   │           ├── version.py
│   │   │           ├── watermark.py
│   │   │           └── weights.py
│   │   └── tgi-entrypoint.sh
│   ├── grpc-metadata/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   ├── llamacpp/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build.rs
│   │   ├── requirements.txt
│   │   └── src/
│   │       ├── backend.rs
│   │       ├── llamacpp.rs
│   │       ├── main.rs
│   │       └── quantize.rs
│   ├── neuron/
│   │   ├── Cargo.toml
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── server/
│   │   │   ├── .gitignore
│   │   │   ├── Makefile
│   │   │   ├── build-requirements.txt
│   │   │   ├── pyproject.toml
│   │   │   └── text_generation_server/
│   │   │       ├── cli.py
│   │   │       ├── generator.py
│   │   │       ├── interceptor.py
│   │   │       ├── model.py
│   │   │       ├── server.py
│   │   │       └── tgi_env.py
│   │   ├── tests/
│   │   │   ├── conftest.py
│   │   │   ├── fixtures/
│   │   │   │   └── model.py
│   │   │   ├── prune_test_models.py
│   │   │   ├── pytest.ini
│   │   │   ├── requirements.txt
│   │   │   ├── server/
│   │   │   │   ├── helpers.py
│   │   │   │   ├── test_cached_model.py
│   │   │   │   ├── test_continuous_batching.py
│   │   │   │   ├── test_decode.py
│   │   │   │   ├── test_generator_slot.py
│   │   │   │   ├── test_info.py
│   │   │   │   └── test_prefill.py
│   │   │   └── test_entry_point.py
│   │   ├── tgi-entrypoint.sh
│   │   └── tgi_entry_point.py
│   ├── trtllm/
│   │   ├── CMakeLists.txt
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build.rs
│   │   ├── cmake/
│   │   │   ├── json.cmake
│   │   │   ├── spdlog.cmake
│   │   │   ├── trtllm.cmake
│   │   │   └── utils/
│   │   │       └── detect_cuda_arch.cu
│   │   ├── csrc/
│   │   │   ├── backend.cpp
│   │   │   ├── backend.hpp
│   │   │   ├── ffi.hpp
│   │   │   └── hardware.hpp
│   │   ├── scripts/
│   │   │   ├── install_tensorrt.sh
│   │   │   └── setup_sccache.py
│   │   ├── src/
│   │   │   ├── errors.rs
│   │   │   ├── lib.rs
│   │   │   ├── looper.rs
│   │   │   ├── main.rs
│   │   │   └── utils.rs
│   │   └── tests/
│   │       ├── test_backend.cpp
│   │       └── test_hardware.cpp
│   ├── v2/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   └── src/
│   │       ├── backend.rs
│   │       ├── client/
│   │       │   ├── grpc_client.rs
│   │       │   ├── mod.rs
│   │       │   └── sharded_client.rs
│   │       ├── lib.rs
│   │       ├── main.rs
│   │       └── queue.rs
│   └── v3/
│       ├── Cargo.toml
│       ├── benches/
│       │   └── prefix_cache.rs
│       ├── build.rs
│       └── src/
│           ├── backend.rs
│           ├── block_allocator.rs
│           ├── client/
│           │   ├── grpc_client.rs
│           │   ├── mod.rs
│           │   └── sharded_client.rs
│           ├── lib.rs
│           ├── main.rs
│           ├── queue.rs
│           └── radix.rs
├── benchmark/
│   ├── Cargo.toml
│   ├── README.md
│   └── src/
│       ├── app.rs
│       ├── event.rs
│       ├── generation.rs
│       ├── lib.rs
│       ├── main.rs
│       ├── table.rs
│       └── utils.rs
├── clients/
│   └── python/
│       ├── .gitignore
│       ├── Makefile
│       ├── README.md
│       ├── pyproject.toml
│       ├── tests/
│       │   ├── conftest.py
│       │   ├── test_client.py
│       │   ├── test_errors.py
│       │   ├── test_inference_api.py
│       │   └── test_types.py
│       └── text_generation/
│           ├── __init__.py
│           ├── client.py
│           ├── errors.py
│           ├── inference_api.py
│           └── types.py
├── crate-hashes.json
├── docs/
│   ├── README.md
│   ├── index.html
│   ├── openapi.json
│   └── source/
│       ├── _toctree.yml
│       ├── architecture.md
│       ├── backends/
│       │   ├── gaudi.mdx
│       │   ├── llamacpp.md
│       │   ├── neuron.md
│       │   └── trtllm.md
│       ├── basic_tutorials/
│       │   ├── consuming_tgi.md
│       │   ├── gated_model_access.md
│       │   ├── monitoring.md
│       │   ├── non_core_models.md
│       │   ├── preparing_model.md
│       │   ├── safety.md
│       │   ├── train_medusa.md
│       │   ├── using_cli.md
│       │   ├── using_guidance.md
│       │   └── visual_language_models.md
│       ├── conceptual/
│       │   ├── chunking.md
│       │   ├── external.md
│       │   ├── flash_attention.md
│       │   ├── guidance.md
│       │   ├── lora.md
│       │   ├── paged_attention.md
│       │   ├── quantization.md
│       │   ├── safetensors.md
│       │   ├── speculation.md
│       │   ├── streaming.md
│       │   └── tensor_parallelism.md
│       ├── index.md
│       ├── installation.md
│       ├── installation_amd.md
│       ├── installation_gaudi.md
│       ├── installation_inferentia.md
│       ├── installation_intel.md
│       ├── installation_nvidia.md
│       ├── installation_tpu.md
│       ├── multi_backend_support.md
│       ├── quicktour.md
│       ├── reference/
│       │   ├── api_reference.md
│       │   ├── launcher.md
│       │   └── metrics.md
│       ├── supported_models.md
│       └── usage_statistics.md
├── flake.nix
├── integration-tests/
│   ├── conftest.py
│   ├── fixtures/
│   │   ├── gaudi/
│   │   │   └── service.py
│   │   └── neuron/
│   │       ├── export_models.py
│   │       └── service.py
│   ├── gaudi/
│   │   ├── capture_expected_outputs.py
│   │   └── test_gaudi_generate.py
│   ├── models/
│   │   ├── __snapshots__/
│   │   │   ├── test.py
│   │   │   ├── test_bloom_560m/
│   │   │   │   ├── test_bloom_560m.json
│   │   │   │   ├── test_bloom_560m_all_params.json
│   │   │   │   └── test_bloom_560m_load.json
│   │   │   ├── test_bloom_560m_sharded/
│   │   │   │   ├── test_bloom_560m_sharded.json
│   │   │   │   └── test_bloom_560m_sharded_load.json
│   │   │   ├── test_chat_llama/
│   │   │   │   └── test_flash_llama_simple.json
│   │   │   ├── test_completion_prompts/
│   │   │   │   ├── test_chat_hfhub_nousage.json
│   │   │   │   ├── test_chat_hfhub_usage.json
│   │   │   │   ├── test_chat_openai_nousage.json
│   │   │   │   ├── test_chat_openai_usage.json
│   │   │   │   ├── test_flash_llama_completion_many_prompts.json
│   │   │   │   ├── test_flash_llama_completion_many_prompts_stream.json
│   │   │   │   ├── test_flash_llama_completion_single_prompt.json
│   │   │   │   └── test_flash_llama_completion_stream_usage.json
│   │   │   ├── test_compressed_tensors_w8a8_int/
│   │   │   │   ├── test_compressed_tensors_w8a8_int.json
│   │   │   │   ├── test_compressed_tensors_w8a8_int_all_params.json
│   │   │   │   └── test_compressed_tensors_w8a8_int_load.json
│   │   │   ├── test_compressed_tensors_w8a8_int_dynamic_weight/
│   │   │   │   ├── test_compressed_tensors_w8a8_int_dynamic_weight.json
│   │   │   │   ├── test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json
│   │   │   │   └── test_compressed_tensors_w8a8_int_dynamic_weight_load.json
│   │   │   ├── test_compressed_tensors_w8an_fp/
│   │   │   │   ├── test_compressed_tensors_w8an.json
│   │   │   │   ├── test_compressed_tensors_w8an_all_params.json
│   │   │   │   └── test_compressed_tensors_w8an_load.json
│   │   │   ├── test_compressed_tensors_wna16_int/
│   │   │   │   ├── test_compressed_tensors_wna16.json
│   │   │   │   ├── test_compressed_tensors_wna16_all_params.json
│   │   │   │   └── test_compressed_tensors_wna16_load.json
│   │   │   ├── test_compressed_tensors_wna16_int_24/
│   │   │   │   ├── test_compressed_tensors_wna16_int_24.json
│   │   │   │   ├── test_compressed_tensors_wna16_int_24_all_params.json
│   │   │   │   └── test_compressed_tensors_wna16_int_24_load.json
│   │   │   ├── test_continue_final_message/
│   │   │   │   ├── test_llama_completion_single_prompt.json
│   │   │   │   └── test_llama_completion_single_prompt_continue.json
│   │   │   ├── test_flash_awq/
│   │   │   │   ├── test_flash_llama_awq.json
│   │   │   │   ├── test_flash_llama_awq_all_params.json
│   │   │   │   └── test_flash_llama_awq_load.json
│   │   │   ├── test_flash_awq_sharded/
│   │   │   │   ├── test_flash_llama_awq_load_sharded.json
│   │   │   │   └── test_flash_llama_awq_sharded.json
│   │   │   ├── test_flash_deepseek_v2/
│   │   │   │   ├── test_flash_deepseek_v2.json
│   │   │   │   ├── test_flash_deepseek_v2_all_params.json
│   │   │   │   └── test_flash_deepseek_v2_load.json
│   │   │   ├── test_flash_falcon/
│   │   │   │   ├── test_flash_falcon.json
│   │   │   │   ├── test_flash_falcon_all_params.json
│   │   │   │   └── test_flash_falcon_load.json
│   │   │   ├── test_flash_gemma/
│   │   │   │   ├── test_flash_gemma_all_params.json
│   │   │   │   ├── test_flash_gemma_load.json
│   │   │   │   └── test_flash_gemma_simple.json
│   │   │   ├── test_flash_gemma2/
│   │   │   │   ├── test_flash_gemma2.json
│   │   │   │   └── test_flash_gemma2_load.json
│   │   │   ├── test_flash_gemma3/
│   │   │   │   ├── test_exceed_window.json
│   │   │   │   ├── test_flash_gemma3.json
│   │   │   │   ├── test_flash_gemma3_image_base64_rgb_jpg.json
│   │   │   │   ├── test_flash_gemma3_image_base64_rgb_png.json
│   │   │   │   ├── test_flash_gemma3_image_base64_rgba.json
│   │   │   │   ├── test_flash_gemma3_image_cow.json
│   │   │   │   └── test_flash_gemma3_image_cow_dog.json
│   │   │   ├── test_flash_gemma_gptq/
│   │   │   │   ├── test_flash_gemma_gptq.json
│   │   │   │   ├── test_flash_gemma_gptq_all_params.json
│   │   │   │   └── test_flash_gemma_gptq_load.json
│   │   │   ├── test_flash_gpt2/
│   │   │   │   ├── test_flash_gpt2.json
│   │   │   │   └── test_flash_gpt2_load.json
│   │   │   ├── test_flash_grammar_llama/
│   │   │   │   ├── test_flash_llama_grammar.json
│   │   │   │   ├── test_flash_llama_grammar_json.json
│   │   │   │   ├── test_flash_llama_grammar_load.json
│   │   │   │   ├── test_flash_llama_grammar_regex.json
│   │   │   │   └── test_flash_llama_grammar_single_load_instance.json
│   │   │   ├── test_flash_llama/
│   │   │   │   ├── test_flash_llama_all_params.json
│   │   │   │   ├── test_flash_llama_load.json
│   │   │   │   └── test_flash_llama_simple.json
│   │   │   ├── test_flash_llama_exl2/
│   │   │   │   ├── test_flash_llama_exl2.json
│   │   │   │   ├── test_flash_llama_exl2_all_params.json
│   │   │   │   └── test_flash_llama_exl2_load.json
│   │   │   ├── test_flash_llama_fp8/
│   │   │   │   ├── test_flash_llama_fp8.json
│   │   │   │   ├── test_flash_llama_fp8_all_params.json
│   │   │   │   └── test_flash_llama_fp8_load.json
│   │   │   ├── test_flash_llama_fp8_kv_cache/
│   │   │   │   ├── test_flash_llama_fp8_kv_cache.json
│   │   │   │   ├── test_flash_llama_fp8_kv_cache_all_params.json
│   │   │   │   └── test_flash_llama_fp8_kv_cache_load.json
│   │   │   ├── test_flash_llama_gptq/
│   │   │   │   ├── test_flash_llama_gptq.json
│   │   │   │   ├── test_flash_llama_gptq_all_params.json
│   │   │   │   └── test_flash_llama_gptq_load.json
│   │   │   ├── test_flash_llama_marlin/
│   │   │   │   ├── test_flash_llama_marlin.json
│   │   │   │   ├── test_flash_llama_marlin_all_params.json
│   │   │   │   └── test_flash_llama_marlin_load.json
│   │   │   ├── test_flash_llama_marlin_24/
│   │   │   │   ├── test_flash_llama_marlin.json
│   │   │   │   ├── test_flash_llama_marlin24_all_params.json
│   │   │   │   └── test_flash_llama_marlin24_load.json
│   │   │   ├── test_flash_llama_prefix/
│   │   │   │   └── test_flash_llama_load.json
│   │   │   ├── test_flash_llama_prefix_flashdecoding/
│   │   │   │   └── test_flash_llama_flashdecoding.json
│   │   │   ├── test_flash_medusa/
│   │   │   │   ├── test_flash_medusa_all_params.json
│   │   │   │   ├── test_flash_medusa_load.json
│   │   │   │   └── test_flash_medusa_simple.json
│   │   │   ├── test_flash_mistral/
│   │   │   │   ├── test_flash_mistral.json
│   │   │   │   ├── test_flash_mistral_all_params.json
│   │   │   │   └── test_flash_mistral_load.json
│   │   │   ├── test_flash_mixtral/
│   │   │   │   ├── test_flash_mixtral.json
│   │   │   │   ├── test_flash_mixtral_all_params.json
│   │   │   │   └── test_flash_mixtral_load.json
│   │   │   ├── test_flash_mixtral_awq/
│   │   │   │   ├── test_flash_mixtral_awq.json
│   │   │   │   ├── test_flash_mixtral_awq_all_params.json
│   │   │   │   └── test_flash_mixtral_awq_load.json
│   │   │   ├── test_flash_mixtral_gptq/
│   │   │   │   ├── test_flash_mixtral_gptq.json
│   │   │   │   ├── test_flash_mixtral_gptq_all_params.json
│   │   │   │   └── test_flash_mixtral_gptq_load.json
│   │   │   ├── test_flash_neox/
│   │   │   │   ├── test_flash_neox.json
│   │   │   │   └── test_flash_neox_load.json
│   │   │   ├── test_flash_neox_sharded/
│   │   │   │   ├── test_flash_neox.json
│   │   │   │   └── test_flash_neox_load.json
│   │   │   ├── test_flash_pali_gemma/
│   │   │   │   ├── test_flash_pali_gemma.json
│   │   │   │   └── test_flash_pali_gemma_two_images.json
│   │   │   ├── test_flash_pali_gemma2/
│   │   │   │   └── test_flash_pali_gemma_image.json
│   │   │   ├── test_flash_phi/
│   │   │   │   ├── test_flash_phi.json
│   │   │   │   ├── test_flash_phi_all_params.json
│   │   │   │   └── test_flash_phi_load.json
│   │   │   ├── test_flash_phi35_moe/
│   │   │   │   ├── test_flash_phi35_moe.json
│   │   │   │   ├── test_flash_phi35_moe_all_params.json
│   │   │   │   └── test_flash_phi35_moe_load.json
│   │   │   ├── test_flash_qwen2/
│   │   │   │   ├── test_flash_qwen2.json
│   │   │   │   ├── test_flash_qwen2_all_params.json
│   │   │   │   └── test_flash_qwen2_load.json
│   │   │   ├── test_flash_qwen2_5_vl/
│   │   │   │   ├── test_flash_qwen2_5_vl_bay.json
│   │   │   │   ├── test_flash_qwen2_5_vl_inpaint.json
│   │   │   │   ├── test_flash_qwen2_5_vl_simple.json
│   │   │   │   └── test_flash_qwen2_5_vl_simple_streaming.json
│   │   │   ├── test_flash_qwen2_vl/
│   │   │   │   ├── test_flash_qwen2_vl_bay.json
│   │   │   │   ├── test_flash_qwen2_vl_inpaint.json
│   │   │   │   ├── test_flash_qwen2_vl_simple.json
│   │   │   │   └── test_flash_qwen2_vl_simple_streaming.json
│   │   │   ├── test_flash_santacoder/
│   │   │   │   ├── test_flash_santacoder.json
│   │   │   │   └── test_flash_santacoder_load.json
│   │   │   ├── test_flash_starcoder/
│   │   │   │   ├── test_flash_starcoder.json
│   │   │   │   ├── test_flash_starcoder_default_params.json
│   │   │   │   └── test_flash_starcoder_load.json
│   │   │   ├── test_flash_starcoder2/
│   │   │   │   ├── test_flash_starcoder2.json
│   │   │   │   ├── test_flash_starcoder2_default_params.json
│   │   │   │   └── test_flash_starcoder2_load.json
│   │   │   ├── test_flash_starcoder2_lora/
│   │   │   │   ├── test_flash_starcoder2.json
│   │   │   │   ├── test_flash_starcoder2_default_params.json
│   │   │   │   ├── test_flash_starcoder2_load.json
│   │   │   │   └── test_flash_starcoder2_with_hugcode_adapter.json
│   │   │   ├── test_flash_starcoder_gptq/
│   │   │   │   ├── test_flash_starcoder_gptq.json
│   │   │   │   ├── test_flash_starcoder_gptq_default_params.json
│   │   │   │   └── test_flash_starcoder_gptq_load.json
│   │   │   ├── test_grammar_llama/
│   │   │   │   └── test_non_flash_llama_grammar_json.json
│   │   │   ├── test_grammar_response_format_llama/
│   │   │   │   ├── test_grammar_response_format_llama_json.1.json
│   │   │   │   ├── test_grammar_response_format_llama_json.2.json
│   │   │   │   └── test_grammar_response_format_llama_json.json
│   │   │   ├── test_idefics/
│   │   │   │   ├── test_idefics.json
│   │   │   │   ├── test_idefics_load.json
│   │   │   │   └── test_idefics_two_images.json
│   │   │   ├── test_idefics2/
│   │   │   │   ├── test_flash_idefics2_next_all_params.json
│   │   │   │   ├── test_flash_idefics2_next_load.json
│   │   │   │   ├── test_flash_idefics2_next_simple.json
│   │   │   │   └── test_flash_idefics2_two_images.json
│   │   │   ├── test_idefics3/
│   │   │   │   └── test_flash_idefics3_next_simple_url.json
│   │   │   ├── test_json_schema_constrain/
│   │   │   │   ├── test_json_schema_basic.json
│   │   │   │   ├── test_json_schema_complex.json
│   │   │   │   └── test_json_schema_stream.json
│   │   │   ├── test_llava_next/
│   │   │   │   ├── test_flash_llava_next_all_params.json
│   │   │   │   ├── test_flash_llava_next_load.json
│   │   │   │   └── test_flash_llava_next_simple.json
│   │   │   ├── test_lora_mistral/
│   │   │   │   ├── test_lora_mistral_with_customer_support_adapter.json
│   │   │   │   ├── test_lora_mistral_with_dbpedia_adapter.json
│   │   │   │   ├── test_lora_mistral_without_adapter.json
│   │   │   │   └── test_lora_mistral_without_customer_support_adapter.json
│   │   │   ├── test_mamba/
│   │   │   │   ├── test_mamba.json
│   │   │   │   ├── test_mamba_all_params.json
│   │   │   │   └── test_mamba_load.json
│   │   │   ├── test_mllama/
│   │   │   │   ├── test_mllama_load.json
│   │   │   │   └── test_mllama_simpl.json
│   │   │   ├── test_mpt/
│   │   │   │   ├── test_mpt.json
│   │   │   │   └── test_mpt_load.json
│   │   │   ├── test_mt0_base/
│   │   │   │   ├── test_mt0_base.json
│   │   │   │   ├── test_mt0_base_all_params.json
│   │   │   │   └── test_mt0_base_load.json
│   │   │   ├── test_neox/
│   │   │   │   ├── test_neox.json
│   │   │   │   └── test_neox_load.json
│   │   │   ├── test_neox_sharded/
│   │   │   │   ├── test_neox.json
│   │   │   │   └── test_neox_load.json
│   │   │   ├── test_server_gptq_quantized/
│   │   │   │   ├── test_server_gptq_quantized.json
│   │   │   │   ├── test_server_gptq_quantized_all_params.json
│   │   │   │   └── test_server_gptq_quantized_load.json
│   │   │   ├── test_smolvlm/
│   │   │   │   └── test_flash_smolvlm_next_simple_url.json
│   │   │   ├── test_t5_sharded/
│   │   │   │   ├── test_t5_sharded.json
│   │   │   │   └── test_t5_sharded_load.json
│   │   │   ├── test_tools_llama/
│   │   │   │   ├── test_flash_llama_grammar_tools_auto_nostream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_choice_nostream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_choice_stream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_insufficient_information_nostream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_insufficient_information_stream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_nostream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_openai.json
│   │   │   │   ├── test_flash_llama_grammar_tools_sea_creatures_stream_auto.json
│   │   │   │   ├── test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json
│   │   │   │   ├── test_flash_llama_grammar_tools_sea_creatures_stream_none.json
│   │   │   │   ├── test_flash_llama_grammar_tools_sea_creatures_stream_required.json
│   │   │   │   └── test_flash_llama_tool_reply_response.json
│   │   │   ├── test_transformers_llama4/
│   │   │   │   ├── test_flash_llama4.json
│   │   │   │   ├── test_flash_llama4_image_base64_rgb_jpg.json
│   │   │   │   ├── test_flash_llama4_image_base64_rgb_png.json
│   │   │   │   ├── test_flash_llama4_image_base64_rgba.json
│   │   │   │   ├── test_flash_llama4_image_cow.json
│   │   │   │   └── test_flash_llama4_image_cow_dog.json
│   │   │   └── test_transformers_olmo/
│   │   │       ├── test_flash_llama_load.json
│   │   │       └── test_flash_llama_simple.json
│   │   ├── test_bloom_560m.py
│   │   ├── test_bloom_560m_sharded.py
│   │   ├── test_chat_llama.py
│   │   ├── test_chat_stream_options.py
│   │   ├── test_completion_prompts.py
│   │   ├── test_compressed_tensors_w8a8_int.py
│   │   ├── test_compressed_tensors_w8a8_int_dynamic_weight.py
│   │   ├── test_compressed_tensors_w8an_fp.py
│   │   ├── test_compressed_tensors_wna16_int.py
│   │   ├── test_compressed_tensors_wna16_int_24.py
│   │   ├── test_continue_final_message.py
│   │   ├── test_flash_awq.py
│   │   ├── test_flash_awq_sharded.py
│   │   ├── test_flash_deepseek_v2.py
│   │   ├── test_flash_falcon.py
│   │   ├── test_flash_gemma.py
│   │   ├── test_flash_gemma2.py
│   │   ├── test_flash_gemma3.py
│   │   ├── test_flash_gemma_gptq.py
│   │   ├── test_flash_gpt2.py
│   │   ├── test_flash_grammar_llama.py
│   │   ├── test_flash_llama.py
│   │   ├── test_flash_llama_exl2.py
│   │   ├── test_flash_llama_fp8.py
│   │   ├── test_flash_llama_fp8_kv_cache.py
│   │   ├── test_flash_llama_gptq.py
│   │   ├── test_flash_llama_marlin.py
│   │   ├── test_flash_llama_marlin_24.py
│   │   ├── test_flash_llama_prefix.py
│   │   ├── test_flash_llama_prefix_flashdecoding.py
│   │   ├── test_flash_medusa.py
│   │   ├── test_flash_mistral.py
│   │   ├── test_flash_mixtral.py
│   │   ├── test_flash_mixtral_awq.py
│   │   ├── test_flash_mixtral_gptq.py
│   │   ├── test_flash_neox.py
│   │   ├── test_flash_neox_sharded.py
│   │   ├── test_flash_pali_gemma.py
│   │   ├── test_flash_pali_gemma2.py
│   │   ├── test_flash_phi.py
│   │   ├── test_flash_phi35_moe.py
│   │   ├── test_flash_qwen2.py
│   │   ├── test_flash_qwen2_5_vl.py
│   │   ├── test_flash_qwen2_vl.py
│   │   ├── test_flash_santacoder.py
│   │   ├── test_flash_starcoder.py
│   │   ├── test_flash_starcoder2.py
│   │   ├── test_flash_starcoder2_lora.py
│   │   ├── test_flash_starcoder_gptq.py
│   │   ├── test_grammar_llama.py
│   │   ├── test_grammar_response_format_llama.py
│   │   ├── test_idefics.py
│   │   ├── test_idefics2.py
│   │   ├── test_idefics3.py
│   │   ├── test_json_schema_constrain.py
│   │   ├── test_llava_next.py
│   │   ├── test_lora_mistral.py
│   │   ├── test_mamba.py
│   │   ├── test_mllama.py
│   │   ├── test_mpt.py
│   │   ├── test_mt0_base.py
│   │   ├── test_neox.py
│   │   ├── test_neox_sharded.py
│   │   ├── test_opt.py
│   │   ├── test_smolvlm.py
│   │   ├── test_t5_sharded.py
│   │   ├── test_tools_llama.py
│   │   ├── test_transformers_llama4.py
│   │   └── test_transformers_olmo.py
│   ├── neuron/
│   │   ├── test_generate.py
│   │   └── test_implicit_env.py
│   ├── pyproject.toml
│   ├── pytest.ini
│   └── requirements.txt
├── launcher/
│   ├── Cargo.toml
│   ├── build.rs
│   └── src/
│       ├── env_runtime.rs
│       ├── gpu.rs
│       └── main.rs
├── load_tests/
│   ├── Makefile
│   ├── benchmarks.py
│   ├── common.js
│   ├── filter.py
│   ├── long.js
│   ├── long.py
│   ├── long_prompt2.py
│   ├── orca.py
│   └── pyproject.toml
├── nix/
│   ├── client.nix
│   ├── crate-overrides.nix
│   ├── docker.nix
│   ├── impure-shell.nix
│   ├── overlay.nix
│   └── server.nix
├── proto/
│   ├── generate.proto
│   └── v3/
│       └── generate.proto
├── router/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   └── src/
│       ├── chat.rs
│       ├── config.rs
│       ├── infer/
│       │   ├── chat_template.rs
│       │   ├── mod.rs
│       │   └── tool_grammar.rs
│       ├── kserve.rs
│       ├── lib.rs
│       ├── logging.rs
│       ├── sagemaker.rs
│       ├── server.rs
│       ├── usage_stats.rs
│       ├── validation.rs
│       └── vertex.rs
├── rust-toolchain.toml
├── sagemaker-entrypoint.sh
├── server/
│   ├── .gitignore
│   ├── Makefile
│   ├── Makefile-awq
│   ├── Makefile-eetq
│   ├── Makefile-exllamav2
│   ├── Makefile-flash-att
│   ├── Makefile-flash-att-v2
│   ├── Makefile-flashinfer
│   ├── Makefile-selective-scan
│   ├── Makefile-vllm
│   ├── README.md
│   ├── bounds-from-nix.py
│   ├── custom_kernels/
│   │   ├── custom_kernels/
│   │   │   ├── fused_attention_cuda.cu
│   │   │   └── fused_bloom_attention_cuda.cu
│   │   └── setup.py
│   ├── exllama_kernels/
│   │   ├── exllama_kernels/
│   │   │   ├── cu_compat.cuh
│   │   │   ├── cuda_buffers.cu
│   │   │   ├── cuda_buffers.cuh
│   │   │   ├── cuda_func/
│   │   │   │   ├── column_remap.cu
│   │   │   │   ├── column_remap.cuh
│   │   │   │   ├── q4_matmul.cu
│   │   │   │   ├── q4_matmul.cuh
│   │   │   │   ├── q4_matrix.cu
│   │   │   │   └── q4_matrix.cuh
│   │   │   ├── exllama_ext.cpp
│   │   │   ├── hip_compat.cuh
│   │   │   ├── matrix.cuh
│   │   │   ├── tuning.h
│   │   │   └── util.cuh
│   │   └── setup.py
│   ├── exllamav2_kernels/
│   │   ├── exllamav2_kernels/
│   │   │   ├── config.h
│   │   │   ├── cpp/
│   │   │   │   └── util.h
│   │   │   ├── cuda/
│   │   │   │   ├── compat.cuh
│   │   │   │   ├── matrix_view.cuh
│   │   │   │   ├── q_gemm.cu
│   │   │   │   ├── q_gemm.cuh
│   │   │   │   ├── q_gemm_kernel.cuh
│   │   │   │   ├── q_gemm_kernel_gptq.cuh
│   │   │   │   ├── q_matrix.cu
│   │   │   │   ├── q_matrix.cuh
│   │   │   │   ├── quant/
│   │   │   │   │   ├── qdq_2.cuh
│   │   │   │   │   ├── qdq_3.cuh
│   │   │   │   │   ├── qdq_4.cuh
│   │   │   │   │   ├── qdq_5.cuh
│   │   │   │   │   ├── qdq_6.cuh
│   │   │   │   │   ├── qdq_8.cuh
│   │   │   │   │   └── qdq_util.cuh
│   │   │   │   └── util.cuh
│   │   │   └── ext.cpp
│   │   └── setup.py
│   ├── pyproject.toml
│   ├── req.txt
│   ├── requirements_cuda.txt
│   ├── requirements_gen.txt
│   ├── requirements_intel.txt
│   ├── requirements_rocm.txt
│   ├── tests/
│   │   ├── conftest.py
│   │   ├── models/
│   │   │   ├── test_bloom.py
│   │   │   ├── test_causal_lm.py
│   │   │   ├── test_model.py
│   │   │   ├── test_santacoder.py
│   │   │   └── test_seq2seq_lm.py
│   │   └── utils/
│   │       ├── test_adapter.py
│   │       ├── test_convert.py
│   │       ├── test_hub.py
│   │       ├── test_layers.py
│   │       ├── test_tokens.py
│   │       ├── test_watermark.py
│   │       └── test_weights.py
│   └── text_generation_server/
│       ├── __init__.py
│       ├── adapters/
│       │   ├── __init__.py
│       │   ├── config.py
│       │   ├── lora.py
│       │   └── weights.py
│       ├── cache.py
│       ├── cli.py
│       ├── interceptor.py
│       ├── layers/
│       │   ├── __init__.py
│       │   ├── attention/
│       │   │   ├── __init__.py
│       │   │   ├── common.py
│       │   │   ├── cuda.py
│       │   │   ├── flash_attn_triton.py
│       │   │   ├── flashinfer.py
│       │   │   ├── ipex.py
│       │   │   ├── kv_cache.py
│       │   │   └── rocm.py
│       │   ├── awq/
│       │   │   ├── conversion_utils.py
│       │   │   └── quantize/
│       │   │       ├── __init__.py
│       │   │       ├── cuda.py
│       │   │       └── ipex.py
│       │   ├── bnb.py
│       │   ├── compressed_tensors/
│       │   │   ├── __init__.py
│       │   │   ├── loader.py
│       │   │   ├── w8a8_int.py
│       │   │   ├── w8an_fp.py
│       │   │   ├── wna16_int.py
│       │   │   └── wna16_int_24.py
│       │   ├── conv.py
│       │   ├── eetq.py
│       │   ├── exl2.py
│       │   ├── fp8.py
│       │   ├── gptq/
│       │   │   ├── __init__.py
│       │   │   ├── custom_autotune.py
│       │   │   ├── exllama.py
│       │   │   ├── exllamav2.py
│       │   │   ├── ipex.py
│       │   │   ├── quantize.py
│       │   │   ├── triton.py
│       │   │   └── utils.py
│       │   ├── layernorm.py
│       │   ├── linear.py
│       │   ├── lora.py
│       │   ├── marlin/
│       │   │   ├── __init__.py
│       │   │   ├── fp8.py
│       │   │   ├── gptq.py
│       │   │   ├── marlin.py
│       │   │   └── util.py
│       │   ├── medusa.py
│       │   ├── mlp.py
│       │   ├── moe/
│       │   │   ├── __init__.py
│       │   │   ├── fp8.py
│       │   │   ├── fused_moe_ipex.py
│       │   │   ├── gptq_marlin.py
│       │   │   └── unquantized.py
│       │   ├── rotary.py
│       │   ├── speculative.py
│       │   └── tensor_parallel.py
│       ├── models/
│       │   ├── __init__.py
│       │   ├── bloom.py
│       │   ├── causal_lm.py
│       │   ├── custom_modeling/
│       │   │   ├── __init__.py
│       │   │   ├── bloom_modeling.py
│       │   │   ├── clip.py
│       │   │   ├── flash_cohere_modeling.py
│       │   │   ├── flash_dbrx_modeling.py
│       │   │   ├── flash_deepseek_v2_modeling.py
│       │   │   ├── flash_deepseek_v3_modeling.py
│       │   │   ├── flash_gemma2_modeling.py
│       │   │   ├── flash_gemma3_modeling.py
│       │   │   ├── flash_gemma_modeling.py
│       │   │   ├── flash_gpt2_modeling.py
│       │   │   ├── flash_gptj_modeling.py
│       │   │   ├── flash_llama_modeling.py
│       │   │   ├── flash_mistral_modeling.py
│       │   │   ├── flash_mixtral_modeling.py
│       │   │   ├── flash_neox_modeling.py
│       │   │   ├── flash_pali_gemma_modeling.py
│       │   │   ├── flash_phi_modeling.py
│       │   │   ├── flash_phi_moe_modeling.py
│       │   │   ├── flash_qwen2_modeling.py
│       │   │   ├── flash_rw_modeling.py
│       │   │   ├── flash_santacoder_modeling.py
│       │   │   ├── flash_starcoder2_modeling.py
│       │   │   ├── gemma3/
│       │   │   │   ├── configuration_gemma3.py
│       │   │   │   ├── image_processing_gemma3.py
│       │   │   │   ├── processing_gemma3.py
│       │   │   │   └── utils.py
│       │   │   ├── idefics2.py
│       │   │   ├── idefics3.py
│       │   │   ├── idefics_config.py
│       │   │   ├── idefics_image_processing.py
│       │   │   ├── idefics_modeling.py
│       │   │   ├── idefics_perceiver.py
│       │   │   ├── idefics_processing.py
│       │   │   ├── idefics_vision.py
│       │   │   ├── llava_next.py
│       │   │   ├── mamba_modeling.py
│       │   │   ├── mllama.py
│       │   │   ├── mpt_modeling.py
│       │   │   ├── neox_modeling.py
│       │   │   ├── opt_modeling.py
│       │   │   ├── phi_modeling.py
│       │   │   ├── qwen2_5_vl.py
│       │   │   ├── qwen2_vl.py
│       │   │   ├── siglip.py
│       │   │   ├── t5_modeling.py
│       │   │   └── vlm.py
│       │   ├── flash_causal_lm.py
│       │   ├── galactica.py
│       │   ├── globals.py
│       │   ├── idefics_causal_lm.py
│       │   ├── mamba.py
│       │   ├── metadata_kernels.py
│       │   ├── mllama_causal_lm.py
│       │   ├── model.py
│       │   ├── seq2seq_lm.py
│       │   ├── transformers_flash_causal_lm.py
│       │   ├── transformers_flash_vlm.py
│       │   ├── types.py
│       │   └── vlm_causal_lm.py
│       ├── pb/
│       │   └── .gitignore
│       ├── server.py
│       ├── tracing.py
│       └── utils/
│           ├── __init__.py
│           ├── adapter.py
│           ├── chunks.py
│           ├── convert.py
│           ├── dist.py
│           ├── hub.py
│           ├── import_utils.py
│           ├── kernels.py
│           ├── log.py
│           ├── logits_process.py
│           ├── merges/
│           │   ├── strategies.py
│           │   └── utils.py
│           ├── peft.py
│           ├── prefill_chunking.py
│           ├── quantization.py
│           ├── segments.py
│           ├── speculate.py
│           ├── tokens.py
│           ├── watermark.py
│           └── weights.py
├── tgi-entrypoint.sh
└── update_doc.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .dockerignore
================================================
aml
target
server/transformers
server/flash-attention
cmake-build-debug/
cmake-build-release/
Dockerfile*


================================================
FILE: .github/ISSUE_TEMPLATE/bug-report.yml
================================================
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve text-generation-inference
body:
  - type: textarea
    id: system-info
    attributes:
      label: System Info
      description: |
        Please share your system info with us (`text-generation-launcher --env` if installed locally).
        The full command line used that causes issues:
        OS version:
        Rust version (if self-compiling, `cargo version`):
        Model being used (`curl 127.0.0.1:8080/info | jq`):
          If local model please explicit the kind of model and/or equivalents.
        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
        Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
        The current version being used:

      placeholder: text-generation-inference version, platform, python version, ...
    validations:
      required: true

  - type: checkboxes
    id: information-scripts-examples
    attributes:
      label: Information
      description: 'The problem arises when using:'
      options:
        - label: "Docker"
        - label: "The CLI directly"

  - type: checkboxes
    id: information-tasks
    attributes:
      label: Tasks
      description: "The thing I am working on is:"
      options:
        - label: "An officially supported command"
        - label: "My own modifications"

  - type: textarea
    id: reproduction
    validations:
      required: true
    attributes:
      label: Reproduction
      description: |
        Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
        If you have code snippets, error messages, stack traces please provide them here as well.
        Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
        Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.

      placeholder: |
        Steps to reproduce the behavior:

          1.
          2.
          3.


  - type: textarea
    id: expected-behavior
    validations:
      required: true
    attributes:
      label: Expected behavior
      description: "A clear and concise description of what you would expect to happen."


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: true
version: 2.1


================================================
FILE: .github/ISSUE_TEMPLATE/feature-request.yml
================================================
name: "\U0001F680 Feature request"
description: Submit a proposal/request for a new text-generation-inference feature
labels: [ "feature" ]
body:
  - type: textarea
    id: feature-request
    validations:
      required: true
    attributes:
      label: Feature request
      description: |
        A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist.

  - type: textarea
    id: motivation
    validations:
      required: true
    attributes:
      label: Motivation
      description: |
        Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.


  - type: textarea
    id: contribution
    validations:
      required: true
    attributes:
      label: Your contribution
      description: |
        Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/huggingface/text-generation-inference/blob/main/CONTRIBUTING.md)


================================================
FILE: .github/ISSUE_TEMPLATE/new-model-addition.yml
================================================
name: "\U0001F31F New model addition"
description: Submit a proposal/request to implement a new model
labels: [ "New model" ]

body:
  - type: textarea
    id: description-request
    validations:
      required: true
    attributes:
      label: Model description
      description: |
        Put any and all important information relative to the model

  - type: checkboxes
    id: information-tasks
    attributes:
      label: Open source status
      description: |
          Please note that if the model implementation isn't available or if the weights aren't open-source, we are less likely to implement it in `transformers`.
      options:
        - label: "The model implementation is available"
        - label: "The model weights are available"

  - type: textarea
    id: additional-info
    attributes:
      label: Provide useful links for the implementation
      description: |
        Please provide information regarding the implementation, the weights, and the authors.
        Please mention the authors by @gh-username if you're aware of their usernames.


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet though.

Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution.

Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.

Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the
      [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
      [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @


@OlivierDehaene OR @Narsil

 -->


================================================
FILE: .github/workflows/autodocs.yaml
================================================
name: Automatic Documentation for Launcher

on:
  pull_request:

jobs:
  update_docs:
    runs-on: ubuntu-latest

    steps:
    - name: Checkout code
      uses: actions/checkout@v2

    - name: Set up Rust
      uses: actions-rs/toolchain@v1
      with:
        profile: minimal
        toolchain: stable

    - name: Install Protocol Buffers compiler
      run: |
        sudo apt-get update
        sudo apt-get install -y protobuf-compiler libprotobuf-dev

    - name: Install Launcher
      id: install-launcher
      run: cargo install --path launcher/

    - name: Install router
      id: install-router
      run: cargo install --path backends/v3/

    - uses: actions/setup-node@v4
      with:
        node-version: 22

    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.x'

    - name: Check that documentation is up-to-date
      run: |
        npm install -g @redocly/cli@1.34.2
        python update_doc.py --check


================================================
FILE: .github/workflows/build.yaml
================================================
name: Build and push docker image to internal registry

on:
  workflow_call:
    inputs:
      hardware:
        type: string
        description: Hardware
        # options:
        # - cuda
        # - cuda-trtllm
        # - rocm
        # - intel
        required: true
      release-tests:
        description: "Run release integration tests"
        required: true
        default: false
        type: boolean

jobs:
  build-and-push:
    outputs:
      docker_image: ${{ steps.final.outputs.docker_image }}
      docker_volume: ${{ steps.final.outputs.docker_volume }}
      docker_devices: ${{ steps.final.outputs.docker_devices }}
      runs_on: ${{ steps.final.outputs.runs_on }}
      label_extension: ${{ steps.final.outputs.label_extension }}
      extra_pytest: ${{ steps.final.outputs.extra_pytest }}
    concurrency:
      group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    runs-on:
      group: aws-highmemory-64-plus-priv
    permissions:
      contents: write
      packages: write
      id-token: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
      - name: Inject required variables for sccache to interact with Github Actions Cache
        uses: actions/github-script@v7
        with:
          script: |
            core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');

      - name: Extract TensorRT-LLM version
        run: |
          echo "TENSORRT_LLM_VERSION=$(grep -oP '([a-z,0-9]{40})' $GITHUB_WORKSPACE/backends/trtllm/cmake/trtllm.cmake)" >> $GITHUB_ENV
          echo "TensorRT-LLM version: ${{ env.TENSORRT_LLM_VERSION }}"
      - name: Construct hardware variables
        shell: bash
        run: |
          case ${{ inputs.hardware }} in
            cuda)
                export dockerfile="Dockerfile"
                export label_extension=""
                export docker_volume="/mnt/cache"
                export docker_devices=""
                export runs_on="aws-g6-12xl-plus-priv-cache"
                export platform=""
                export extra_pytest=""
                export target=""
                ;;
            cuda-trtllm)
                export dockerfile="Dockerfile_trtllm"
                export label_extension="-trtllm"
                export docker_volume="/mnt/cache"
                export docker_devices=""
                export runs_on="ubuntu-latest"
                export platform=""
                export extra_pytest=""
                if [[ "${GITHUB_REF}" == refs/tags/* ]]; then
                  export build_type="release";
                  export target="";
                else
                  export build_type="dev";
                  export target="ci-runtime";
                fi
                ;;
            rocm)
                export dockerfile="Dockerfile_amd"
                export label_extension="-rocm"
                export docker_devices="/dev/kfd,/dev/dri"
                export docker_volume="/mnt"
                # This runner was deactivated.
                export runs_on="ubuntu-latest"
                export platform=""
                export extra_pytest="-k test_flash_gemma_gptq_load"
                export target=""
                ;;
            intel-xpu)
                export dockerfile="Dockerfile_intel"
                export label_extension="-intel-xpu"
                export docker_devices=""
                export docker_volume="/mnt/cache"
                export runs_on="ubuntu-latest"
                export platform="xpu"
                export extra_pytest=""
                export target=""
                ;;
            intel-cpu)
                export dockerfile="Dockerfile_intel"
                export label_extension="-intel-cpu"
                export docker_devices="none"
                export docker_volume="/mnt/cache"
                # export runs_on="ubuntu-latest"
                export runs_on="aws-highmemory-32-plus-priv"
                export platform="cpu"
                export extra_pytest="-k test_flash_gemma_simple"
                export target=""
                ;;
            neuron)
                export dockerfile="Dockerfile.neuron"
                export label_extension="-neuron"
                export docker_devices="/dev/neuron0"
                export docker_volume="/mnt/cache"
                export runs_on="aws-inf2-8xlarge"
                export platform="cpu"
                export extra_pytest="--neuron"
                export target=""
                ;;
            gaudi)
                export dockerfile="Dockerfile_gaudi"
                export label_extension="-gaudi"
                export docker_volume="/mnt/cache"
                export docker_devices=""
                export runs_on="itac-bm-emr-gaudi3-dell-2gaudi"
                export platform=""
                export extra_pytest="--gaudi"
                export target=""
          esac
          echo $dockerfile
          echo "Dockerfile=${dockerfile}"
          echo $label_extension
          echo $docker_devices
          echo $runs_on
          echo $platform
          echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV
          echo "LABEL_EXTENSION=${label_extension}" >> $GITHUB_ENV
          echo "PLATFORM=${platform}" >> $GITHUB_ENV
          echo "DOCKER_VOLUME=${docker_volume}" >> $GITHUB_ENV
          echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
          echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
          echo "EXTRA_PYTEST=${extra_pytest}" >> $GITHUB_ENV
          echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV
          echo "TARGET=${target}" >> $GITHUB_ENV
          echo "BUILD_TYPE=${build_type}" >> $GITHUB_ENV
      - name: Initialize Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          install: true
          buildkitd-config: /tmp/buildkitd.toml
      - name: Login to internal Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.REGISTRY_USERNAME }}
          password: ${{ secrets.REGISTRY_PASSWORD }}
          registry: registry.internal.huggingface.tech
      - name: Login to GitHub Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Login to Docker Hub Container Registry
        uses: docker/login-action@v3
        with:
          registry: docker.io
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      - name: configure aws credentials
        id: aws-creds
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_GITHUB_BUILDX_CACHE }}
          role-duration-seconds: 18000
          aws-region: us-east-1
          output-credentials: true
      # If pull request
      - name: Extract metadata (tags, labels) for Docker
        if: ${{ github.event_name == 'pull_request' }}
        id: meta-pr
        uses: docker/metadata-action@v5
        with:
          images: |
            docker.io/huggingface/text-generation-inference-ci
          tags: |
            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL_EXTENSION }}
      # If main, release or tag
      - name: Extract metadata (tags, labels) for Docker
        if: ${{ github.event_name != 'pull_request' }}
        id: meta
        uses: docker/metadata-action@v4.3.0
        with:
          flavor: |
            latest=false
          images: |
            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
            ghcr.io/huggingface/text-generation-inference
          tags: |
            type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
            type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}
            type=raw,value=latest${{ env.LABEL_EXTENSION }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL_EXTENSION }}
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v4
        env: 
          DOCKER_BUILD_SUMMARY: false
        with:
          context: .
          file: ${{ env.DOCKERFILE }}
          push: true
          platforms: 'linux/amd64'
          build-args: |
            GIT_SHA=${{ env.GITHUB_SHA }}
            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL_EXTENSION }}
            PLATFORM=${{ env.PLATFORM }}
            build_type=${{ env.BUILD_TYPE }}
            sccache_gha_enabled=on
          secrets: |
            actions_results_url=${{ env.ACTIONS_RESULTS_URL }}
            actions_runtime_token=${{ env.ACTIONS_RUNTIME_TOKEN }}
          target: ${{ env.TARGET }}
          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
          cache-from: type=s3,region=us-east-1,bucket=${{ vars.AWS_S3BUCKET_GITHUB_BUILDX_CACHE }},name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ steps.aws-creds.outputs.aws-access-key-id }},secret_access_key=${{ steps.aws-creds.outputs.aws-secret-access-key }},session_token=${{ steps.aws-creds.outputs.aws-session-token }},mode=max
          cache-to: type=s3,region=us-east-1,bucket=${{ vars.AWS_S3BUCKET_GITHUB_BUILDX_CACHE }},name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ steps.aws-creds.outputs.aws-access-key-id }},secret_access_key=${{ steps.aws-creds.outputs.aws-secret-access-key }},session_token=${{ steps.aws-creds.outputs.aws-session-token }},mode=max
      - name: Final
        id: final
        run: |

          if [ "${{ github.event_name }}" = "pull_request" ]; then
            echo "docker_image=docker.io/huggingface/text-generation-inference-ci:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
          else
            echo "docker_image=ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
          fi
          echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
          echo "docker_volume=${{ env.DOCKER_VOLUME }}" >> "$GITHUB_OUTPUT"
          echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
          echo "label_extension=${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
          echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT"
  precompile_neuron_models:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    needs: build-and-push
    if: needs.build-and-push.outputs.label_extension == '-neuron'
    runs-on:
      group: ${{ needs.build-and-push.outputs.runs_on }}
    env:
      PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Install
        run: |
          make install-integration-tests
      - name: Export neuron models
        run: |
          export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
          echo $DOCKER_IMAGE
          docker pull $DOCKER_IMAGE
          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
          python integration-tests/fixtures/neuron/export_models.py
  integration_tests:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    needs: [precompile_neuron_models, build-and-push]
    if: ${{ always() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && needs.build-and-push.outputs.runs_on != 'ubuntu-latest' }}
    runs-on:
      group: ${{ needs.build-and-push.outputs.runs_on }}
    env:
      PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Install
        run: |
          make install-integration-tests
      - name: Run tests
        run: |
          export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }}
          export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
          export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}"
          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          echo $DOCKER_IMAGE
          docker pull $DOCKER_IMAGE
          pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}

  backend_trtllm_cxx_tests:
    needs: build-and-push
    if: needs.build-and-push.outputs.label_extension == '-trtllm'
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-trtllm-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    runs-on:
      group: aws-g6-12xl-plus-priv-cache
    container:
      image: ${{ needs.build-and-push.outputs.docker_image }}
      credentials:
        username: ${{ secrets.DOCKERHUB_USERNAME }}
        password: ${{ secrets.DOCKERHUB_PASSWORD }}
      options: --gpus all --shm-size=8g

    steps:
      - name: Run C++/CUDA tests
        if: ${{ env.LABEL_EXTENSION == 'ci-runtime' }}
        run: /usr/local/tgi/bin/tgi_trtllm_backend_tests


================================================
FILE: .github/workflows/build_documentation.yaml
================================================
name: Build documentation

on:
  push:
    paths:
      - "docs/source/**"
    branches:
      - main
      - doc-builder*
      - v*-release

jobs:
   build:
    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
    with:
      commit_sha: ${{ github.sha }}
      package: text-generation-inference
      additional_args: --not_python_module
    secrets:
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}


================================================
FILE: .github/workflows/build_pr_documentation.yaml
================================================
name: Build PR Documentation

on:
  pull_request:
    paths:
      - "docs/source/**"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  build:
    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
    with:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: text-generation-inference
      additional_args: --not_python_module


================================================
FILE: .github/workflows/ci_build.yaml
================================================
name: CI build

on:
  push:
    branches:
      - 'main'
    tags:
      - 'v*'
  pull_request:
    paths:
      - ".github/workflows/build.yaml"
      - "integration-tests/**"
      - "backends/**"
      - "server/**"
      - "proto/**"
      - "router/**"
      - "launcher/**"
      - "Cargo.lock"
      - "rust-toolchain.toml"
      - "Dockerfile"
      - "Dockerfile_amd"
      - "Dockerfile_intel"
      - "Dockerfile.neuron"
      - "Dockerfile_gaudi"
    branches:
      - "main"
  workflow_dispatch:
    inputs:
      release-tests:
        description: "Run release integration tests"
        required: true
        default: false
        type: boolean

jobs:
  build:
    strategy:
      # super important if you want to see all results, even if one fails
      # fail-fast is true by default
      fail-fast: false
      matrix:
        hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron", "gaudi"]
    uses: ./.github/workflows/build.yaml # calls the one above ^
    permissions:
      contents: write
      packages: write
      id-token: write
    with:
      hardware: ${{ matrix.hardware }}
      # https://github.com/actions/runner/issues/2206
      release-tests: ${{ inputs.release-tests == true }}
    secrets: inherit


================================================
FILE: .github/workflows/client-tests.yaml
================================================
name: Python Client Tests

on:
  pull_request:
    paths:
      - ".github/workflows/client-tests.yaml"
      - "clients/python/**"

jobs:
  run_tests:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: 3.9
      - name: Install
        run: |
          cd clients/python && pip install .
      - name: Run tests
        run: |
          pip install pytest pytest-asyncio
          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          make python-client-tests


================================================
FILE: .github/workflows/codeql.yml
================================================
---
name: CodeQL Security Analysis For Github Actions

on:
  push:
    branches: ["main"]
  workflow_dispatch:
  # pull_request:

jobs:
  codeql:
    name: CodeQL Analysis
    uses: huggingface/security-workflows/.github/workflows/codeql-reusable.yml@v1.2.0
    permissions:
      security-events: write
      packages: read
      actions: read
      contents: read
    with:
      languages: '["actions"]'
      queries: 'security-extended,security-and-quality'
      runner: 'ubuntu-latest' #optional if need custom runner
      use-runner-group: false #optional

      # if need to use runner group:
      # runner: 'cpu-low'
      # use-runner-group: true


================================================
FILE: .github/workflows/integration_tests.yaml
================================================
name: Integration tests

on:
  workflow_call:
    inputs:
      docker_image:
        type: string
        description: Hardware
        required: true
      docker_devices:
        type: string
        description: Hardware
      runs_on:
        type: string
        required: true
        description: Hardware to run integration tests
jobs:
  integration_tests:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    runs-on: ${{ inputs.runs_on }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install
        run: |
          make install-integration-tests
      - name: Run tests
        run: |
          export DOCKER_VOLUME=/mnt/cache
          export DOCKER_IMAGE=${{ inputs.docker_image }}
          export DOCKER_DEVICES=${{ inputs.docker_devices }}
          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          pytest -s -vv integration-tests


================================================
FILE: .github/workflows/load_test.yaml
================================================
name: Nightly load test

on:
  schedule:
    - cron: '0 0 * * 1-5'
  workflow_call:
  workflow_dispatch:

  pull_request:
    paths:
      - ".github/workflows/load_test.yaml"

env:
  AWS_DEFAULT_REGION: us-east-1
  AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}

jobs:
  load-tests:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    runs-on:
      group: aws-g6-12xl-plus-priv-cache
    env:
      DOCKER_VOLUME: /cache
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Install Python 3.11
        uses: actions/setup-python@v2
        with:
          python-version: 3.11

      - name: Install poetry
        run: |
          curl -sSL https://install.python-poetry.org | python3 -
          export PATH="$HOME/.local/bin:$PATH"
          poetry --version

      - name: Run bench test
        run: |
          export PATH="$HOME/.local/bin:$PATH"
          cd load_tests
          poetry install
          poetry run python benchmarks.py --sha ${{ github.sha }} --results-file "s3://text-generation-inference-ci/benchmarks/ci/${{ github.sha }}.parquet"
        shell: bash
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN_BENCHMARK }}


================================================
FILE: .github/workflows/nix_build.yaml
================================================
name: "Nix Build Docker image"
on:
  pull_request:
  push:
    branches:
      - 'main'
    tags:
      - 'v*'
concurrency:
  group: nix-image-${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  build_nix_image:
    runs-on:
      group: aws-highmemory-32-plus-priv
    steps:
    - uses: actions/checkout@v4
    - uses: cachix/install-nix-action@v27
      with:
        nix_path: nixpkgs=channel:nixos-unstable
    - uses: cachix/cachix-action@v14
      with:
        name: huggingface
        # If you chose signing key for write access
        # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
      env:
        USER: github_runner
    - name: Build
      run: nix build .#dockerImage
    - name: Initialize Docker Buildx
      uses: docker/setup-buildx-action@v3
      with:
        install: true
        buildkitd-config: /tmp/buildkitd.toml
    - name: Inject slug/short variables
      uses: rlespinasse/github-slug-action@v4.4.1
    - name: Login to internal Container Registry
      # if: github.event_name != 'pull_request'
      uses: docker/login-action@v3
      with:
        username: ${{ secrets.REGISTRY_USERNAME }}
        password: ${{ secrets.REGISTRY_PASSWORD }}
        registry: registry.internal.huggingface.tech
    - name: Push to docker
      run: |
        if [ "${{ github.event_name }}" = "pull_request" ]; then
          export TAG=nix-sha-${{ env.GITHUB_SHA_SHORT }}
        else
          export TAG=${{ github.ref_name }}-nix
        fi
        export IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:$TAG
        nix-shell -p skopeo --command "skopeo --insecure-policy copy docker-archive:$(readlink -f ./result) docker://$IMAGE --dest-compress-format zstd"


================================================
FILE: .github/workflows/nix_cache.yaml
================================================
name: "Cache devshells"
on:
  pull_request:
    paths:
      - "flake.nix"
      - "flake.lock"
      - "nix/**"
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  tests:
    runs-on:
      group: aws-highmemory-32-plus-priv
    steps:
      - uses: actions/checkout@v4
      - uses: cachix/install-nix-action@v27
        with:
          nix_path: nixpkgs=channel:nixos-unstable
      - uses: cachix/cachix-action@v14
        with:
          name: huggingface
          # If you chose signing key for write access
          #authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}"
        env:
          USER: github_runner
      - name: Build impure devshell
        run: nix build .\#devShells.x86_64-linux.impure
      - name: Build impure devshell (CUDA dev)
        run: nix build .\#devShells.x86_64-linux.impureWithCuda
      # Pure shell dependencies are covered by Nix tests.
      # - name: Build pure devshell
      #   run: nix build .\#devShells.x86_64-linux.pure


================================================
FILE: .github/workflows/nix_tests.yaml
================================================
name: "Nix Tests"
on:
  pull_request:
    paths:
      - ".github/workflows/nix_tests.yaml"
      - "server/**"
      - "proto/**"
      - "router/**"
      - "launcher/**"
      - "backends/**"
      - "Cargo.lock"
      - "rust-toolchain.toml"
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  tests:
    runs-on:
      group: aws-highmemory-32-plus-priv
    steps:
    - uses: actions/checkout@v4
    - uses: cachix/install-nix-action@v27
      with:
        nix_path: nixpkgs=channel:nixos-unstable
    - uses: cachix/cachix-action@v14
      with:
        name: huggingface
        # If you chose signing key for write access
        #authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
      env:
        USER: github_runner
    - name: Nix info
      run: nix-shell -p nix-info --run "nix-info -m"
    - name: Build
      run: nix develop .#test --command echo "Ok"
    - name: Pre-commit tests.
      run: nix develop .#test --command pre-commit run --all-files
    - name: Python tests.
      run: nix develop .#test --command python -m pytest server/tests/
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
    - name: Rust tests.
      run: nix develop .#test --command cargo test


================================================
FILE: .github/workflows/stale.yaml
================================================
name: 'Close stale issues and PRs'
on:
  schedule:
    - cron: '30 1 * * *'

jobs:
  stale:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/stale@v8
        with:
          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
          days-before-stale: 30
          days-before-close: 5


================================================
FILE: .github/workflows/tests.yaml
================================================
name: Server Tests

on:
  pull_request:
    paths:
      - ".github/workflows/tests.yaml"
      - "server/**"
      - "proto/**"
      - "router/**"
      - "launcher/**"
      - "backends/**"
      - "Cargo.lock"
      - "rust-toolchain.toml"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  run_tests:
    runs-on:
      group: aws-highmemory-32-plus-priv
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        id: python
        with:
          python-version: 3.11
      - uses: dtolnay/rust-toolchain@1.85.0
        with:
          components: rustfmt, clippy
      - name: Install Protoc
        uses: arduino/setup-protoc@v1
      - name: Clean unused files
        run: |
          sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
          sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
      - name: Install
        run: |
          sudo apt update
          sudo apt install python3.11-dev -y
          pip install -U pip uv
          uv venv
          source ./.venv/bin/activate
          make install-cpu
      - name: Download locked kernels
        run: |
          source ./.venv/bin/activate
          kernels download server
      - name: Run server tests
        run: |
          source ./.venv/bin/activate
          uv pip install pytest
          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          pytest -s -vv server/tests
      - name: Pre-commit checks
        run: |
          pip install pre-commit
          pre-commit install
          pre-commit run --all-files
      - name: Run Rust tests
        run: |
          cargo test
      - name: Run Rust tests with google feature
        run: |
          cargo test --features google


================================================
FILE: .github/workflows/trufflehog.yaml
================================================
on:
  push:

name: Secret Leaks

permissions:
  contents: read

jobs:
  trufflehog:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@853e1e8d249fd1e29d0fcc7280d29b03df3d643d
        with:
          # exclude buggy postgres detector that is causing false positives and not relevant to our codebase
          extra_args: --results=verified,unknown --exclude-detectors=postgres


================================================
FILE: .github/workflows/upload_pr_documentation.yaml
================================================
name: Upload PR Documentation

on:
  workflow_run:
    workflows: ["Build PR Documentation"]
    types:
      - completed

jobs:
  build:
    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
    with:
      package_name: text-generation-inference
    secrets:
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}


================================================
FILE: .gitignore
================================================
.idea
target
router/tokenizer.json
*__pycache__*

backends/v2/src/client/pb
backends/v3/src/client/pb
backends/client/src/v2/pb
backends/client/src/v3/pb

# ROCm auto-generated files
*.hip
server/exllamav2
server/exllama_kernels/exllama_kernels/hip/
server/exllama_kernels/exllama_kernels/hip_func/
*_hip.cuh
server/exllama_kernels/exllama_kernels/hip_buffers.cuh
server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp

data/
load_tests/*.json
server/fbgemmm

.direnv/
.venv/

# Gaudi auto-generated files
hl-smi_log*.txt
.graph_dumps
out
hqt_output


================================================
FILE: .pre-commit-config.yaml
================================================
repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
    -   id: check-yaml
    -   id: end-of-file-fixer
        exclude: crate-hashes.json
    -   id: trailing-whitespace
        exclude: docs/source/reference/launcher.md
-   repo: https://github.com/psf/black
    rev: 24.2.0
    hooks:
    -   id: black
-   repo: https://github.com/doublify/pre-commit-rust
    rev: v1.0
    hooks:
    -   id: cargo-check
    -   id: fmt
    -   id: clippy
-   repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.3.0
    hooks:
    -   id: ruff
        args: [--fix, --exit-non-zero-on-fix]


================================================
FILE: .redocly.lint-ignore.yaml
================================================
# This file instructs Redocly's linter to ignore the rules contained for specific parts of your API.
# See https://redoc.ly/docs/cli/ for more information.
docs/openapi.json:
  no-empty-servers:
    - '#/openapi'
  spec:
    - >-
      #/components/schemas/GenerateParameters/properties/best_of/exclusiveMinimum
    - >-
      #/components/schemas/GenerateParameters/properties/frequency_penalty/exclusiveMinimum
    - '#/components/schemas/GenerateParameters/properties/grammar/nullable'
    - >-
      #/components/schemas/GenerateParameters/properties/repetition_penalty/exclusiveMinimum
    - '#/components/schemas/GenerateParameters/properties/seed/exclusiveMinimum'
    - >-
      #/components/schemas/GenerateParameters/properties/temperature/exclusiveMinimum
    - '#/components/schemas/GenerateParameters/properties/top_k/exclusiveMinimum'
    - >-
      #/components/schemas/GenerateParameters/properties/top_n_tokens/exclusiveMinimum
    - '#/components/schemas/GenerateParameters/properties/top_p/exclusiveMinimum'
    - >-
      #/components/schemas/GenerateParameters/properties/typical_p/exclusiveMinimum
    - '#/components/schemas/GenerateResponse/properties/details/nullable'
    - '#/components/schemas/StreamResponse/properties/details/nullable'
    - '#/components/schemas/ChatRequest/properties/response_format/nullable'
    - '#/components/schemas/ChatRequest/properties/stream_options/nullable'
    - '#/components/schemas/ChatRequest/properties/tool_choice/nullable'
    - '#/components/schemas/ToolChoice/nullable'
    - '#/components/schemas/ChatCompletionComplete/properties/logprobs/nullable'
    - '#/components/schemas/ChatCompletionChunk/properties/usage/nullable'
    - '#/components/schemas/ChatCompletionChoice/properties/logprobs/nullable'
  no-invalid-media-type-examples:
    - '#/paths/~1/post/responses/422/content/application~1json/example'
    - '#/paths/~1/post/responses/424/content/application~1json/example'
    - '#/paths/~1/post/responses/429/content/application~1json/example'
    - '#/paths/~1/post/responses/500/content/application~1json/example'
    - '#/paths/~1generate/post/responses/422/content/application~1json/example'
    - '#/paths/~1generate/post/responses/424/content/application~1json/example'
    - '#/paths/~1generate/post/responses/429/content/application~1json/example'
    - '#/paths/~1generate/post/responses/500/content/application~1json/example'
    - >-
      #/paths/~1generate_stream/post/responses/422/content/text~1event-stream/example
    - >-
      #/paths/~1generate_stream/post/responses/424/content/text~1event-stream/example
    - >-
      #/paths/~1generate_stream/post/responses/429/content/text~1event-stream/example
    - >-
      #/paths/~1generate_stream/post/responses/500/content/text~1event-stream/example
    - '#/paths/~1tokenize/post/responses/404/content/application~1json/example'
    - >-
      #/paths/~1v1~1chat~1completions/post/responses/422/content/application~1json/example
    - >-
      #/paths/~1v1~1chat~1completions/post/responses/424/content/application~1json/example
    - >-
      #/paths/~1v1~1chat~1completions/post/responses/429/content/application~1json/example
    - >-
      #/paths/~1v1~1chat~1completions/post/responses/500/content/application~1json/example
    - >-
      #/paths/~1v1~1completions/post/responses/422/content/application~1json/example
    - >-
      #/paths/~1v1~1completions/post/responses/424/content/application~1json/example
    - >-
      #/paths/~1v1~1completions/post/responses/429/content/application~1json/example
    - >-
      #/paths/~1v1~1completions/post/responses/500/content/application~1json/example
  operation-4xx-response:
    - '#/paths/~1health/get/responses'
    - '#/paths/~1info/get/responses'
    - '#/paths/~1metrics/get/responses'
  no-unused-components:
    - '#/components/schemas/Completion'
  security-defined:
    - '#/paths/~1/post'
    - '#/paths/~1generate/post'
    - '#/paths/~1generate_stream/post'
    - '#/paths/~1health/get'
    - '#/paths/~1info/get'
    - '#/paths/~1metrics/get'
    - '#/paths/~1tokenize/post'
    - '#/paths/~1v1~1chat~1completions/post'
    - '#/paths/~1v1~1completions/post'
    - '#/paths/~1v1~1models/get'


================================================
FILE: CODE_OF_CONDUCT.md
================================================

# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
  community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
  any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
  without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
feedback@huggingface.co.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].

For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].

[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations


================================================
FILE: CONTRIBUTING.md
================================================
<!---
Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Contribute to text-generation-inference

Everyone is welcome to contribute, and we value everybody's contribution. Code
contributions are not the only way to help the community. Answering questions, helping
others, and improving the documentation are also immensely valuable.

It also helps us if you spread the word! Reference the library in blog posts
about the awesome projects it made possible, shout out on Twitter every time it has
helped you, or simply ⭐️ the repository to say thank you.

However you choose to contribute, please be mindful and respect our
[code of conduct](https://github.com/huggingface/text-generation-inference/blob/main/CODE_OF_CONDUCT.md).

**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**

## Ways to contribute

There are several ways you can contribute to text-generation-inference.

* Fix outstanding issues with the existing code.
* Submit issues related to bugs or desired new features.
* Contribute to the examples or to the documentation.

> All contributions are equally valuable to the community. 🥰

## Fixing outstanding issues

If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) and open
a Pull Request!

## Submitting a bug-related issue or feature request

Do your best to follow these guidelines when submitting a bug-related issue or a feature
request. It will make it easier for us to come back to you quickly and with good
feedback.

### Did you find a bug?

The text-generation-inference library is robust and reliable thanks to users who report the problems they encounter.

Before you report an issue, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the
library itself, and not your code.

Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so
we can quickly resolve it:

* Your **OS type and version**, as well as your environment versions (versions of rust, python, and dependencies).
* A short, self-contained, code snippet that allows us to reproduce the bug.
* The *full* traceback if an exception is raised.
* Attach any other additional information, like screenshots, you think may help.

To get the OS and software versions automatically, you can re-run the launcher with the `--env` flag:

```bash
text-generation-launcher --env
```

This will precede the launch of the model with the information relative to your environment. We recommend pasting
that in your issue report.

### Do you want a new feature?

If there is a new feature you'd like to see in text-generation-inference, please open an issue and describe:

1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it
   a feature related to something you need for a project? Is it something you worked on and think it could benefit
   the community?

   Whatever it is, we'd love to hear about it!

2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better
   we'll be able to help you.
3. Provide a *code snippet* that demonstrates the feature's usage.
4. If the feature is related to a paper, please include a link.

If your issue is well written we're already 80% of the way there by the time you create it.

We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE)
to help you get started with your issue.

## Do you want to implement a new model?

New models are constantly released and if you want to implement a new model, please provide the following information:

* A short description of the model and a link to the paper.
* Link to the implementation if it is open-sourced.
* Link to the model weights if they are available.

If you are willing to contribute the model yourself, let us know so we can help you add it to text-generation-inference!

## Do you want to add documentation?

We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know
how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be
happy to make the changes or help you make a contribution if you're interested!

## I want to become a maintainer of the project. How do I get there?

TGI is a project led and managed by Hugging Face as it powers our internal services. However, we are happy to have
motivated individuals from other organizations join us as maintainers with the goal of making TGI the best inference
service.

If you are such an individual (or organization), please reach out to us and let's collaborate.


================================================
FILE: Cargo.toml
================================================
[workspace]
members = [
    "benchmark",
    "backends/v2",
    "backends/v3",
    "backends/grpc-metadata",
    "backends/trtllm",
    "backends/llamacpp",
    "launcher",
    "router"
]
default-members = [
    "benchmark",
    "backends/v2",
    "backends/v3",
    "backends/grpc-metadata",
    # "backends/trtllm",
    "launcher",
    "router"
]
resolver = "2"

[workspace.package]
version = "3.3.6-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"

[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.4.2", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
pyo3 = { version = "0.22.2", features = ["auto-initialize"] }

[profile.release]
incremental = true

[profile.release-binary]
inherits = "release"
debug = 1
incremental = true
panic = "abort"

[profile.release-opt]
inherits = "release"
debug = 0
incremental = false
lto = "fat"
opt-level = 3
codegen-units = 1


================================================
FILE: Dockerfile
================================================
# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher

RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.11-dev
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt --frozen

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install
WORKDIR /usr/src/

# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.7
ARG PYTHON_VERSION=3.11

# Keep in sync with `server/pyproject.toml
# Automatically set by buildx
ARG TARGETPLATFORM

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git && \
        rm -rf /var/lib/apt/lists/*
COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
ENV PATH="$PATH:/root/.local/bin"
RUN uv python install ${PYTHON_VERSION}
RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} torchvision pip setuptools packaging
ENV VIRTUAL_ENV=/usr/src/.venv/
ENV PATH="$PATH:/usr/src/.venv/bin/"

# CUDA kernels builder image
FROM pytorch-install AS kernel-builder

ARG MAX_JOBS=8
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build cmake \
        && rm -rf /var/lib/apt/lists/*

# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN . .venv/bin/activate && make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN . .venv/bin/activate && make build-flash-attention-v2-cuda

# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .

RUN . .venv/bin/activate && python setup.py build

# Build Transformers exllama kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-exllamav2/ Makefile

# Build specific version of transformers
RUN . .venv/bin/activate && make build-exllamav2

# Build Transformers awq kernels
FROM kernel-builder AS awq-kernels-builder
WORKDIR /usr/src
COPY server/Makefile-awq Makefile
# Build specific version of transformers
RUN . .venv/bin/activate && make build-awq

# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build specific version of transformers
RUN . .venv/bin/activate && python setup.py build

# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
COPY server/Makefile-selective-scan Makefile
RUN . .venv/bin/activate && make build-all

# Build flashinfer
FROM kernel-builder AS flashinfer-builder
WORKDIR /usr/src
COPY server/Makefile-flashinfer Makefile
RUN . .venv/bin/activate && make install-flashinfer

# Text Generation Inference base image
FROM nvidia/cuda:12.4.0-base-ubuntu22.04 AS base

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*

# RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# ENV PATH="$PATH:/root/.local/bin"
COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
# Install flash-attention dependencies
# RUN pip install einops --no-cache-dir

# Copy env with PyTorch installed
COPY --from=pytorch-install /usr/src/.venv /usr/src/.venv
ENV PYTHON_VERSION=3.11
RUN uv python install ${PYTHON_VERSION}
ENV VIRTUAL_ENV=/usr/src/.venv/
ENV PATH="$PATH:/usr/src/.venv/bin/"

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
ENV HF_KERNELS_CACHE=/kernels
RUN cd server && \
	uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --extra torch --no-install-project --active && \
    make gen-server-raw && \
    kernels download .

RUN cd server && \
    uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --extra torch --active --python=${PYTHON_VERSION} && \
    uv pip install nvidia-nccl-cu12==2.25.1 && \
    pwd && \
    text-generation-server --help

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/.venv/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/.venv/lib/python3.11/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from awq kernels builder
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages
COPY --from=flashinfer-builder /usr/src/.venv/lib/python3.11/site-packages/flashinfer/ /usr/src/.venv/lib/python3.11/site-packages/flashinfer/


# ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
# Required to find libpython within the rust binaries
# This is needed because exl2 tries to load flash-attn
# And fails with our builds.
ENV EXLLAMA_NO_FLASH_ATTN=1

# Deps before the binaries
# The binaries change on every build given we burn the SHA into them
# The deps change less often.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
        && rm -rf /var/lib/apt/lists/*

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher


# AWS Sagemaker compatible image
FROM base AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/"
ENTRYPOINT ["/tgi-entrypoint.sh"]
# CMD ["--json-output"]


================================================
FILE: Dockerfile.neuron
================================================
# Fetch and extract the TGI sources
FROM alpine AS tgi
RUN mkdir -p /tgi

# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
FROM alpine AS optimum-neuron
RUN mkdir -p /optimum-neuron
ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.3.0.tar.gz /optimum-neuron/sources.tar.gz
RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note: we cannot use the cargo-chef base image as it uses python 3.11
FROM ubuntu:22.04 AS chef

RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
    curl ca-certificates build-essential \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked

WORKDIR /usr/src

FROM chef AS planner
COPY backends/neuron/Cargo.toml Cargo.toml
COPY Cargo.lock Cargo.lock
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
    unzip python3-dev libssl-dev pkg-config \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY backends/neuron/Cargo.toml Cargo.toml
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json

COPY Cargo.lock Cargo.lock
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --release

# Python base image
FROM ubuntu:22.04 AS base

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    python3-pip \
    python3-setuptools \
    python-is-python3 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip

# Python server build image
FROM base AS pyserver

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    make \
    python3-venv \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN install -d /pyserver
WORKDIR /pyserver
COPY backends/neuron/server server
COPY proto proto
RUN pip3 install -r server/build-requirements.txt
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server package

# Neuron base image (used for deployment)
FROM base AS neuron

# Install system prerequisites
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    gnupg2 \
    wget \
    python3-dev \
    libexpat1 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -

# Install neuronx packages
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    aws-neuronx-dkms=2.22.2.0 \
    aws-neuronx-collectives=2.26.43.0-47cc904ea \
    aws-neuronx-runtime-lib=2.26.42.0-2ff3b5c7d  \
    aws-neuronx-tools=2.24.54.0 \
    libxml2 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

# Install manually torch CPU version to avoid pulling CUDA
RUN pip3 install \
    torch==2.7.0 \
    torchvision==0.22.0 \
    --index-url https://download.pytorch.org/whl/cpu

RUN pip3 install \
    neuronx-cc==2.19.8089.0+8ab9f450 \
    torch-neuronx==2.7.0.2.8.6734+ac864f72 \
    neuronx-distributed==0.13.14393+b8569585 \
    libneuronxla==2.2.4410.0+835a67fb \
    --extra-index-url=https://pip.repos.neuron.amazonaws.com

# Install HuggingFace packages
RUN pip3 install \
    hf_transfer huggingface_hub

# Install optimum-neuron
COPY --from=optimum-neuron /optimum-neuron optimum-neuron
RUN pip3 install ./optimum-neuron

# TGI base env
ENV HUGGINGFACE_HUB_CACHE=/tmp \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Disable color logs as they are not supported by CloudWatch
ENV LOGURU_COLORIZE=NO
ENV LOG_COLORIZE=0

# Install router
COPY --from=builder /usr/src/target/release/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
# Install python server
COPY --from=pyserver /pyserver/build/dist dist
RUN pip install dist/text_generation_server*.tar.gz

# Final image
FROM neuron

COPY backends/neuron/tgi_entry_point.py /tgi_entry_point.py
COPY backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]


================================================
FILE: Dockerfile.nix
================================================
# Build the image and get out the docker file:
#
# docker build -t tgi-nix-builder -f Dockerfile.nix
# docker run --log-driver=none tgi-nix-builder | docker load

FROM nixos/nix:2.18.8 AS builder
RUN echo "experimental-features = nix-command flakes" >> /etc/nix/nix.conf
RUN nix profile install nixpkgs#cachix
RUN cachix use huggingface
WORKDIR /root
ADD . .
RUN nix build .
RUN mkdir /tmp/nix-store-closure
RUN cp -R $(nix-store -qR result/) /tmp/nix-store-closure

FROM ubuntu:24.04

WORKDIR /app

# Copy /nix/store
COPY --from=builder /tmp/nix-store-closure /nix/store
COPY --from=builder /root/result /app
RUN ldconfig
CMD ["ldconfig", "/app/bin/text-generation-launcher"]


================================================
FILE: Dockerfile_amd
================================================
# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.11-dev
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt --frozen

FROM rocm/dev-ubuntu-22.04:6.3.1-complete AS base

ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"

ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

ARG PYTHON_VERSION=3.11

RUN mkdir -p /app
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git \
        ninja-build \
        cmake \
        software-properties-common \
        python3.11-dev \
        python3.11-venv && \
        rm -rf /var/lib/apt/lists/*

COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
ENV PATH="$PATH:/root/.local/bin"
RUN uv python install ${PYTHON_VERSION}
RUN uv venv --python ${PYTHON_VERSION} && uv pip install pip setuptools packaging
ENV VIRTUAL_ENV=/usr/src/.venv/
ENV PATH="$PATH:/usr/src/.venv/bin/"

RUN . .venv/bin/activate && pip install -U packaging cmake ninja wheel setuptools pybind11 Cython

FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
ARG LEGACY_HIPBLASLT_OPTION
RUN git clone https://github.com/ROCm/hipBLAS-common.git
RUN . .venv/bin/activate && cd hipBLAS-common \
    && git checkout ${HIPBLAS_COMMON_BRANCH} \
    && mkdir build \
    && cd build \
    && cmake .. \
    && make package \
    && dpkg -i ./*.deb
RUN git clone https://github.com/ROCm/hipBLASLt
RUN . .venv/bin/activate && cd hipBLASLt \
    && git checkout ${HIPBLASLT_BRANCH} \
    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
    && cd build/release \
    && make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install

FROM base AS build_rccl
ARG RCCL_BRANCH
ARG RCCL_REPO
RUN git clone ${RCCL_REPO}
RUN . .venv/bin/activate && cd rccl \
    && git checkout ${RCCL_BRANCH} \
    && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install

FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO
RUN git clone ${TRITON_REPO}
RUN . .venv/bin/activate && cd triton \
    && git checkout ${TRITON_BRANCH} \
    && cd python \
    && python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install

FROM base AS build_amdsmi
RUN . .venv/bin/activate && cd /opt/rocm/share/amd_smi \
    && pip wheel . --wheel-dir=dist
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install

FROM base AS build_pytorch
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
RUN git clone ${PYTORCH_REPO} pytorch
RUN . .venv/bin/activate && cd pytorch && git checkout ${PYTORCH_BRANCH} && \
    pip install -r requirements.txt && git submodule update --init --recursive \
    && python3 tools/amd_build/build_amd.py \
    && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
    && pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision
RUN . .venv/bin/activate && cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && pip install dist/*.whl
RUN git clone ${FA_REPO}
RUN . .venv/bin/activate && cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/flash-attention/dist/*.whl /app/install

FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    . .venv/bin/activate && \
    pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    . .venv/bin/activate && \
    pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    . .venv/bin/activate && \
    pip install /install/*.whl

ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN . .venv/bin/activate && cd aiter \
    && git checkout ${AITER_BRANCH} \
    && git submodule update --init --recursive \
    && pip install -r requirements.txt \
    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter

RUN rm -rf /var/lib/apt/lists/*

FROM final AS kernel-builder
# # Build vllm kernels
FROM kernel-builder AS vllm-builder

COPY server/Makefile-vllm Makefile
RUN . .venv/bin/activate && pip install setuptools_scm

# Build specific version of vllm
RUN . .venv/bin/activate && make build-vllm-rocm

# Build Transformers CUDA kernels (gpt-neox and bloom)
FROM kernel-builder AS custom-kernels-builder
COPY server/custom_kernels/ .
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist

# Build exllama kernels
FROM kernel-builder AS exllama-kernels-builder
COPY server/exllama_kernels/ .
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist

# Build exllama v2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
COPY server/exllamav2_kernels/ .
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist

FROM kernel-builder AS marlin-kernels
ENV MARLIN_KERNELS_BRANCH=v0.3.6
ENV VLLM_TARGET_DEVICE=rocm
RUN . .venv/bin/activate && git clone https://github.com/danieldk/marlin-kernels.git && \
    cd marlin-kernels && \
    git checkout ${MARLIN_KERNELS_BRANCH} && \
    python3 setup.py bdist_wheel --dist-dir=dist

FROM kernel-builder AS moe-kernels
ENV MOE_KERNELS_BRANCH=v0.8.2
ENV VLLM_TARGET_DEVICE=rocm
RUN . .venv/bin/activate && git clone https://github.com/danieldk/moe-kernels.git && \
    cd moe-kernels && \
    git checkout ${MOE_KERNELS_BRANCH} && \
    python3 setup.py bdist_wheel --dist-dir=dist

FROM final AS base-copy

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

ENV VIRTUAL_ENV=/app/.venv/
ENV PATH="$PATH:/app/.venv/bin/"

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
RUN cd server && \
    uv pip install grpcio-tools mypy-protobuf && \
    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir && \
    make gen-server-raw
RUN cd server && \
    pwd && \
    text-generation-server --help

RUN --mount=type=bind,from=vllm-builder,src=/app/vllm/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=exllama-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=exllamav2-kernels-builder,src=/app/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=marlin-kernels,src=/app/marlin-kernels/dist,target=/install \
    uv pip install /install/*.whl
RUN --mount=type=bind,from=moe-kernels,src=/app/moe-kernels/dist,target=/install \
    uv pip install /install/*.whl

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher

# AWS Sagemaker compatible image
FROM base AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base-copy

# Set AS recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
ENV HIP_FORCE_DEV_KERNARG=1

# On MI250 and MI300, performances for flash with Triton FA are slightly better than CK.
# However, Triton requires a tunning for each prompt length, which is prohibitive.
ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0
ENV ROCM_USE_CUSTOM_PAGED_ATTN=1
ENV PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP=0
ENV VLLM_MOE_PADDING=0
ENV ATTENTION=paged
ENV PREFIX_CACHING=0
ENV PREFILL_CHUNKING=0
ENV ROCM_USE_SKINNY_GEMM=1

COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib"
ENV PYTHONPATH=/app/.venv/lib/python3.11/site-packages
# CMD ["--json-output"]


================================================
FILE: Dockerfile_gaudi
================================================
# Those arguments are required to build the image
ARG HABANA_VERSION=1.21.0
ARG PYTORCH_VERSION=2.6.0

# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ENV PYO3_PYTHON="/root/.local/bin/python" \
    PYTHON_SYS_EXECUTABLE="/root/.local/bin/python" \
    PYO3_PYTHON_VERSION="3.10"

RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
    && . $HOME/.local/bin/env \
    && uv python install 3.10 --default --preview \
    && test -f /root/.local/bin/python || (echo "Python 3.10 not found at /root/.local/bin/python" && exit 1)

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt

# Text Generation Inference base image
ARG HABANA_VERSION
ARG PYTORCH_VERSION

FROM vault.habana.ai/gaudi-docker/${HABANA_VERSION}/ubuntu22.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:latest AS base

ENV ATTENTION=paged
ENV PREFIX_CACHING=0
ENV PREFILL_CHUNKING=0
ENV PT_HPU_LAZY_MODE=1
ENV PT_HPU_WEIGHT_SHARING=0
ENV VLLM_EXPONENTIAL_BUCKETING=true

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Assert that Python 3.10 is installed as the launcher is compiled with Python 3.10
RUN python3.10 --version || (echo "Python 3.10 is not installed" && exit 1)

# libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it
RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*

# Install server
COPY proto proto
COPY backends/gaudi/server server
COPY backends/gaudi/server/Makefile server/Makefile
ARG HABANA_VERSION
RUN cd server && \
    make gen-server && \
    pip install --no-deps -r requirements.txt && \
    bash ./dill-0.3.8-patch.sh && \
    pip install . --no-cache-dir
RUN pip install git+https://github.com/sywangyi/vllm-hpu-extension.git@bmax_fix
RUN pip install compressed-tensors==0.9.1

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher


# AWS Sagemaker compatible image
FROM base AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base

ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV HABANA_VISIBLE_DEVICES=all
ENV OMPI_MCA_btl_vader_single_copy_mechanism=NONE

COPY backends/gaudi/tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"]


================================================
FILE: Dockerfile_intel
================================================
ARG PLATFORM=xpu

FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.11-dev
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

ARG GIT_SHA
ARG DOCKER_LABEL

COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt --frozen


# Text Generation Inference base image for Intel

FROM intel/oneapi-basekit:2025.1.3-0-devel-ubuntu22.04 AS xpu

USER root

ARG MAMBA_VERSION=23.1.0-1
ARG PYTHON_VERSION='3.11.10'
# Automatically set by buildx
ARG TARGETPLATFORM
ENV PATH=/opt/conda/bin:$PATH

# TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
# Install mamba
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh -O  "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

# libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it
RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb

RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" > /tmp/intel-for-pytorch-gpu-dev.list

RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-ocloc libnl-genl-3-200

# Text Generation Inference base env
ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80


WORKDIR /usr/src

RUN pip install torch==2.8.0 torchvision==0.23.0 --index-url https://download.pytorch.org/whl/xpu

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
ENV UV_SYSTEM_PYTHON=1
RUN cd server && \
    make gen-server && \
    pip install -U pip uv && \
    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
ENV CCL_ZE_IPC_EXCHANGE=sockets
ENV TORCH_LLM_ALLREDUCE=1
ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0

RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.8.10%2Bxpu-cp311-cp311-linux_x86_64.whl
# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher


# Text Generation Inference base image for Intel-cpu
FROM ubuntu:22.04 AS cpu

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    curl \
    ca-certificates \
    make \
    g++-12 \
    gcc-12 \
    git \
    wget \
    cmake \
    libnuma-dev

RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 12
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
RUN update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30
RUN update-alternatives --set cc /usr/bin/gcc

RUN update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30
RUN update-alternatives --set c++ /usr/bin/g++


ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

ARG MAMBA_VERSION=23.1.0-1
ARG PYTHON_VERSION='3.11.10'
# Automatically set by buildx
ARG TARGETPLATFORM
ENV PATH=/opt/conda/bin:$PATH

# TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
# Install mamba
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh -O  "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
                         /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

RUN conda install -c conda-forge gperftools mkl

RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cpu
RUN pip install triton==3.2.0 py-libnuma

WORKDIR /usr/src

RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/cpu/intel_extension_for_pytorch-2.7.0%2Bcpu-cp311-cp311-linux_x86_64.whl
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/cpu/oneccl_bind_pt-2.7.0%2Bcpu-cp311-cp311-linux_x86_64.whl


ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so
ENV CCL_ROOT=/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch
ENV I_MPI_ROOT=/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch
ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch/lib
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
ENV UV_SYSTEM_PYTHON=1
RUN cd server && \
    make gen-server && \
    pip install -U pip uv && \
    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher

FROM ${PLATFORM} AS final
ENV ATTENTION=flashdecoding-ipex
ENV PREFIX_CACHING=1
ENV PREFILL_CHUNKING=1
ENV CUDA_GRAPHS=0
ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]


================================================
FILE: Dockerfile_llamacpp
================================================
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps

ARG llamacpp_version=b4827
ARG llamacpp_cuda=OFF
ARG llamacpp_native=ON
ARG llamacpp_cpu_arm_arch=native
ARG cuda_arch=75-real;80-real;86-real;89-real;90-real

WORKDIR /opt/src

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
    clang \
    cmake \
    curl \
    git \
    python3-dev \
    libssl-dev \
    pkg-config \
    tar

ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
RUN mkdir -p llama.cpp \
 && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
 && cd llama.cpp \
 && cmake -B build \
    -DCMAKE_INSTALL_PREFIX=/usr \
    -DCMAKE_INSTALL_LIBDIR=/usr/lib \
    -DCMAKE_C_COMPILER=clang \
    -DCMAKE_CXX_COMPILER=clang++ \
    -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
    -DGGML_CUDA=${llamacpp_cuda} \
    -DGGML_NATIVE=${llamacpp_native} \
    -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
    -DLLAMA_BUILD_COMMON=OFF \
    -DLLAMA_BUILD_TESTS=OFF \
    -DLLAMA_BUILD_EXAMPLES=OFF \
    -DLLAMA_BUILD_SERVER=OFF \
 && cmake --build build --parallel --config Release \
 && cmake --install build

WORKDIR /app
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef --locked

FROM deps AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

FROM deps AS builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook \
    --recipe-path recipe.json \
    --profile release \
    --package text-generation-router-llamacpp
COPY . .
RUN cargo build \
    --profile release \
    --package text-generation-router-llamacpp --frozen

FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
WORKDIR /app

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y \
    python3-venv \
    python3-pip

RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

COPY backends/llamacpp/requirements.txt requirements.txt
COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/

RUN pip3 install --no-cache-dir \
    -r requirements.txt \
    -e gguf-py

COPY --from=builder /usr/lib/libllama.so /usr/lib/
COPY --from=builder /usr/lib/libggml*.so /usr/lib/
COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/

ENV HF_HUB_ENABLE_HF_TRANSFER=1

ENTRYPOINT ["text-generation-router-llamacpp"]


================================================
FILE: Dockerfile_trtllm
================================================
ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real;100-real;120-real"
ARG cuda_base=12.8.0
ARG build_type=release
ARG ompi_version=4.1.7
ARG sccache_gha_enabled=off
ARG actions_results_url=""
ARG actions_runtime_token=""

# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:${cuda_base}-cudnn-devel-ubuntu24.04 AS cuda-builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    build-essential \
    cmake \
    curl \
    gcc-14  \
    g++-14 \
    git \
    git-lfs \
    lld \
    libssl-dev \
    libucx-dev \
    libasan8 \
    libubsan1 \
    ninja-build \
    pkg-config \
    pipx \
    python3 \
    python3-dev \
    python3-setuptools \
    tar \
    wget --no-install-recommends && \
    pipx ensurepath

ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt

# Install OpenMPI
FROM cuda-builder AS mpi-builder
WORKDIR /opt/src/mpi

ARG ompi_version
ENV OMPI_VERSION=${ompi_version}
ENV OMPI_TARBALL_FILENAME=openmpi-${OMPI_VERSION}.tar.bz2
ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \
    https://download.open-mpi.org/release/open-mpi/v4.1/${OMPI_TARBALL_FILENAME} .

RUN tar --strip-components=1 -xf ${OMPI_TARBALL_FILENAME} &&\
    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
    make -j all && \
    make install && \
    rm -rf ${OMPI_TARBALL_FILENAME}/..

# Install TensorRT
FROM cuda-builder AS trt-builder
COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
RUN chmod +x /opt/install_tensorrt.sh && \
    /opt/install_tensorrt.sh

# Build Backend
FROM cuda-builder AS tgi-builder
WORKDIR /usr/src/text-generation-inference

# Scoped global args reuse
ARG cuda_arch_list
ARG build_type
ARG sccache_gha_enabled

# Install Rust
ENV PATH="/root/.cargo/bin:$PATH"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.1 --profile minimal -y && \
    chmod -R a+w /root/.rustup && \
    chmod -R a+w /root/.cargo && \
    cargo install sccache --version ">=0.10.0" --locked

ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig"
ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt"

ENV USE_LLD_LINKER=ON
ENV CUDA_ARCH_LIST=${cuda_arch_list}

# SCCACHE Specifics args - before finding a better, more generic, way...
ENV SCCACHE_GHA_ENABLED=${sccache_gha_enabled}

COPY Cargo.lock Cargo.lock
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY router router
COPY backends backends
COPY benchmark benchmark
COPY launcher launcher
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi

ENV RUSTC_WRAPPER=sccache
ENV CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX
RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
    export CMAKE_C_COMPILER_LAUNCHER=sccache && \
    export CMAKE_CXX_COMPILER_LAUNCHER=sccache && \
    export CMAKE_CUDA_COMPILER_LAUNCHER=sccache && \
    mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
    cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \
    sccache --show-stats

FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS runtime
RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
    rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
    pipx ensurepath && \
    pipx install --include-deps transformers tokenizers

WORKDIR /usr/local/tgi/bin

ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
ENV TOKENIZERS_PARALLELISM=false
ENV OMPI_MCA_plm_rsh_agent=""

COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher

# This is used only for the CI/CD
FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS ci-runtime
RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
    rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
    pipx ensurepath && \
    pipx install --include-deps transformers tokenizers

WORKDIR /usr/local/tgi/bin

ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
ENV TOKENIZERS_PARALLELISM=false
ENV OMPI_MCA_plm_rsh_agent=""

COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi

# Basically we copy from target/debug instead of target/release
COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher

# This is the final image
FROM runtime

LABEL co.huggingface.vendor="Hugging Face Inc."
LABEL org.opencontainers.image.authors="hardware@hf.co"
LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend"

ENTRYPOINT ["./text-generation-launcher"]
CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2022 Hugging Face

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: Makefile
================================================
install-server:
	cd server && make install

install-server-cpu:
	cd server && make install-server

install-router:
	cargo install --path backends/v3/

install-launcher:
	cargo install --path launcher/

install-benchmark:
	cargo install --path benchmark/

install: install-server install-router install-launcher


install-cpu: install-server-cpu install-router install-launcher

server-dev:
	cd server && make run-dev

router-dev:
	cd router && cargo run -- --port 8080

rust-tests: install-router install-launcher
	cargo test

install-integration-tests:
	cd integration-tests && pip install -r requirements.txt
	cd clients/python && pip install .

integration-tests: install-integration-tests
	pytest -s -vv -m "not private" integration-tests

update-integration-tests: install-integration-tests
	pytest -s -vv --snapshot-update integration-tests

python-server-tests:
	HF_HUB_ENABLE_HF_TRANSFER=1 pytest -s -vv -m "not private" server/tests

python-client-tests:
	pytest clients/python/tests

python-tests: python-server-tests python-client-tests

run-falcon-7b-instruct:
	text-generation-launcher --model-id tiiuae/falcon-7b-instruct --port 8080

run-falcon-7b-instruct-quantize:
	text-generation-launcher --model-id tiiuae/falcon-7b-instruct --quantize bitsandbytes --port 8080

clean:
	rm -rf target aml

preview_doc:
	doc-builder preview text-generation-inference docs/source --not_python_module


================================================
FILE: README.md
================================================
> [!CAUTION]
> text-generation-inference is now in maintenance mode. Going forward, we will accept pull requests for minor bug fixes, documentation improvements and lightweight maintenance tasks.
>
> TGI has initiated the movement for optimized inference engines to rely on a `transformers` model architectures. This approach is now adopted by downstream inference engines, which we contribute to and recommend using going forward: [vllm](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), as well as local engines with inter-compatibility such as llama.cpp or MLX.

<div align="center">

<a href="https://www.youtube.com/watch?v=jlMAX2Oaht0">
  <img width=560 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png">
</a>

# Text Generation Inference

<a href="https://github.com/huggingface/text-generation-inference">
  <img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/huggingface/text-generation-inference?style=social">
</a>
<a href="https://huggingface.github.io/text-generation-inference">
  <img alt="Swagger API documentation" src="https://img.shields.io/badge/API-Swagger-informational">
</a>

A Rust, Python and gRPC server for text generation inference. Used in production at [Hugging Face](https://huggingface.co)
to power Hugging Chat, the Inference API and Inference Endpoints.

</div>

## Table of contents

  - [Get Started](#get-started)
    - [Docker](#docker)
    - [API documentation](#api-documentation)
    - [Using a private or gated model](#using-a-private-or-gated-model)
    - [A note on Shared Memory (shm)](#a-note-on-shared-memory-shm)
    - [Distributed Tracing](#distributed-tracing)
    - [Architecture](#architecture)
    - [Local install](#local-install)
    - [Local install (Nix)](#local-install-nix)
  - [Optimized architectures](#optimized-architectures)
  - [Run locally](#run-locally)
    - [Run](#run)
    - [Quantization](#quantization)
  - [Develop](#develop)
  - [Testing](#testing)

Text Generation Inference (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and [more](https://huggingface.co/docs/text-generation-inference/supported_models). TGI implements many features, such as:

- Simple launcher to serve most popular LLMs
- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)
- Tensor Parallelism for faster inference on multiple GPUs
- Token streaming using Server-Sent Events (SSE)
- Continuous batching of incoming requests for increased total throughput
- [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) compatible with Open AI Chat Completion API
- Optimized transformers code for inference using [Flash Attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
- Quantization with :
  - [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
  - [GPT-Q](https://arxiv.org/abs/2210.17323)
  - [EETQ](https://github.com/NetEase-FuXi/EETQ)
  - [AWQ](https://github.com/casper-hansen/AutoAWQ)
  - [Marlin](https://github.com/IST-DASLab/marlin)
  - [fp8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/)
- [Safetensors](https://github.com/huggingface/safetensors) weight loading
- Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
- Logits warper (temperature scaling, top-p, top-k, repetition penalty, more details see [transformers.LogitsProcessor](https://huggingface.co/docs/transformers/internal/generation_utils#transformers.LogitsProcessor))
- Stop sequences
- Log probabilities
- [Speculation](https://huggingface.co/docs/text-generation-inference/conceptual/speculation) ~2x latency
- [Guidance/JSON](https://huggingface.co/docs/text-generation-inference/conceptual/guidance). Specify output format to speed up inference and make sure the output is valid according to some specs..
- Custom Prompt Generation: Easily generate text by providing custom prompts to guide the model's output
- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance

### Hardware support

- [Nvidia](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference)
- [AMD](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference) (-rocm)
- [Inferentia](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference)
- [Intel GPU](https://github.com/huggingface/text-generation-inference/pull/1475)
- [Gaudi](https://github.com/huggingface/tgi-gaudi)
- [Google TPU](https://huggingface.co/docs/optimum-tpu/howto/serving)


## Get Started

### Docker

For a detailed starting guide, please see the [Quick Tour](https://huggingface.co/docs/text-generation-inference/quicktour). The easiest way of getting started is using the official Docker container:

```shell
model=HuggingFaceH4/zephyr-7b-beta
# share a volume with the Docker container to avoid downloading weights every run
volume=$PWD/data

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model
```

And then you can make requests like

```bash
curl 127.0.0.1:8080/generate_stream \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```

You can also use [TGI's Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) to obtain Open AI Chat Completion API compatible responses.

```bash
curl localhost:8080/v1/chat/completions \
    -X POST \
    -d '{
  "model": "tgi",
  "messages": [
    {
      "role": "system",
      "content": "You are a helpful assistant."
    },
    {
      "role": "user",
      "content": "What is deep learning?"
    }
  ],
  "stream": true,
  "max_tokens": 20
}' \
    -H 'Content-Type: application/json'
```

**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.

**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5-rocm --model-id $model` instead of the command above.

To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```
text-generation-launcher --help
```

### API documentation

You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route.
The Swagger UI is also available at: [https://huggingface.github.io/text-generation-inference](https://huggingface.github.io/text-generation-inference).

### Using a private or gated model

You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by
`text-generation-inference`. This allows you to gain access to protected resources.

For example, if you want to serve the gated Llama V2 model variants:

1. Go to https://huggingface.co/settings/tokens
2. Copy your CLI READ token
3. Export `HF_TOKEN=<your CLI READ token>`

or with Docker:

```shell
model=meta-llama/Meta-Llama-3.1-8B-Instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
token=<your cli READ token>

docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model
```

### A note on Shared Memory (shm)

[`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
`PyTorch` to do distributed training/inference. `text-generation-inference` makes
use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.

In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if
peer-to-peer using NVLink or PCI is not possible.

To allow the container to use 1G of Shared Memory and support SHM sharing, we add `--shm-size 1g` on the above command.

If you are running `text-generation-inference` inside `Kubernetes`. You can also add Shared Memory to the container by
creating a volume with:

```yaml
- name: shm
  emptyDir:
   medium: Memory
   sizeLimit: 1Gi
```

and mounting it to `/dev/shm`.

Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that
this will impact performance.

### Distributed Tracing

`text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
by setting the address to an OTLP collector with the `--otlp-endpoint` argument. The default service name can be
overridden with the `--otlp-service-name` argument

### Architecture

![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)

Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with TGI (Martin Iglesias Goyanes - Adyen, 2024)](https://www.adyen.com/knowledge-hub/llm-inference-at-scale-with-tgi)

### Local install

You can also opt to install `text-generation-inference` locally.

First clone the repository and change directory into it:

```shell
git clone https://github.com/huggingface/text-generation-inference
cd text-generation-inference
```

Then [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda` or `python venv`:

```shell
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

#using conda
conda create -n text-generation-inference python=3.11
conda activate text-generation-inference

#using python venv
python3 -m venv .venv
source .venv/bin/activate
```

You may also need to install Protoc.

On Linux:

```shell
PROTOC_ZIP=protoc-21.12-linux-x86_64.zip
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
sudo unzip -o $PROTOC_ZIP -d /usr/local bin/protoc
sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP
```

On MacOS, using Homebrew:

```shell
brew install protobuf
```

Then run:

```shell
BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork with CUDA kernels
text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
```

**Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:

```shell
sudo apt-get install libssl-dev gcc -y
```

### Local install (Nix)

Another option is to install `text-generation-inference` locally using [Nix](https://nixos.org). Currently,
we only support Nix on x86_64 Linux with CUDA GPUs. When using Nix, all dependencies can
be pulled from a binary cache, removing the need to build them locally.

First follow the instructions to [install Cachix and enable the Hugging Face cache](https://app.cachix.org/cache/huggingface).
Setting up the cache is important, otherwise Nix will build many of the dependencies
locally, which can take hours.

After that you can run TGI with `nix run`:

```shell
cd text-generation-inference
nix run --extra-experimental-features nix-command --extra-experimental-features flakes . -- --model-id meta-llama/Llama-3.1-8B-Instruct
```

**Note:** when you are using Nix on a non-NixOS system, you have to [make some symlinks](https://danieldk.eu/Nix-CUDA-on-non-NixOS-systems#make-runopengl-driverlib-and-symlink-the-driver-library)
to make the CUDA driver libraries visible to Nix packages.

For TGI development, you can use the `impure` dev shell:

```shell
nix develop .#impure

# Only needed the first time the devshell is started or after updating the protobuf.
(
cd server
mkdir text_generation_server/pb || true
python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
       --grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
touch text_generation_server/pb/__init__.py
)
```

All development dependencies (cargo, Python, Torch), etc. are available in this
dev shell.

## Optimized architectures

TGI works out of the box to serve optimized models for all modern models. They can be found in [this list](https://huggingface.co/docs/text-generation-inference/supported_models).

Other architectures are supported on a best-effort basis using:

`AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`

or

`AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")`


## Run locally

### Run

```shell
text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
```

### Quantization

You can also run pre-quantized weights (AWQ, GPTQ, Marlin) or on-the-fly quantize weights with bitsandbytes, EETQ, fp8, to reduce the VRAM requirement:

```shell
text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
```

4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.

Read more about quantization in the [Quantization documentation](https://huggingface.co/docs/text-generation-inference/en/conceptual/quantization).

## Develop

```shell
make server-dev
make router-dev
```

## Testing

```shell
# python
make python-server-tests
make python-client-tests
# or both server and client tests
make python-tests
# rust cargo tests
make rust-tests
# integration tests
make integration-tests
```


================================================
FILE: assets/tgi_grafana.json
================================================
{
  "__inputs": [
    {
      "name": "DS_PROMETHEUS_EKS API INFERENCE PROD",
      "label": "Prometheus EKS API Inference Prod",
      "description": "",
      "type": "datasource",
      "pluginId": "prometheus",
      "pluginName": "Prometheus"
    }
  ],
  "__elements": {},
  "__requires": [
    {
      "type": "panel",
      "id": "gauge",
      "name": "Gauge",
      "version": ""
    },
    {
      "type": "grafana",
      "id": "grafana",
      "name": "Grafana",
      "version": "10.0.2"
    },
    {
      "type": "panel",
      "id": "heatmap",
      "name": "Heatmap",
      "version": ""
    },
    {
      "type": "datasource",
      "id": "prometheus",
      "name": "Prometheus",
      "version": "1.0.0"
    },
    {
      "type": "panel",
      "id": "timeseries",
      "name": "Time series",
      "version": ""
    }
  ],
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "grafana",
          "uid": "-- Grafana --"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "target": {
          "limit": 100,
          "matchAny": false,
          "tags": [],
          "type": "dashboard"
        },
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 2,
  "id": 551,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "fieldMinMax": false,
          "mappings": [],
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 1000
              }
            ]
          },
          "unit": "ms"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 0,
        "y": 0
      },
      "id": 49,
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "mean"
          ],
          "fields": "",
          "values": false
        },
        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "10.4.2",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m]))) * 1000) > 0",
          "hide": true,
          "instant": false,
          "legendFormat": "__auto",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m]))) * 1000) > 0",
          "hide": true,
          "instant": false,
          "legendFormat": "__auto",
          "range": true,
          "refId": "C"
        },
        {
          "datasource": {
            "name": "Expression",
            "type": "__expr__",
            "uid": "__expr__"
          },
          "expression": "$B + $C",
          "hide": false,
          "refId": "D",
          "type": "math"
        }
      ],
      "title": "Time to first token",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "ms"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 8,
        "x": 9,
        "y": 0
      },
      "id": 44,
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "mean"
          ],
          "fields": "",
          "values": false
        },
        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "10.4.2",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m]))) * 1000)>0",
          "instant": false,
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Decode per-token latency",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              }
            ]
          },
          "unit": "short"
        },
        "overrides": []
      },
      "gridPos": {
        "h": 7,
        "w": 7,
        "x": 17,
        "y": 0
      },
      "id": 45,
      "options": {
        "colorMode": "value",
        "graphMode": "area",
        "justifyMode": "auto",
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "mean"
          ],
          "fields": "",
          "values": false
        },
        "showPercentChange": false,
        "textMode": "auto",
        "wideLayout": true
      },
      "pluginVersion": "10.4.2",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "sum((rate(tgi_request_generated_tokens_sum{container=\"$service\"}[10m]) / rate(tgi_request_generated_tokens_count{container=\"$service\"}[10m]))>0)",
          "instant": false,
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Throughput (generated tok/s)",
      "type": "stat"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "none"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 7
      },
      "id": 48,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Number of tokens per prompt",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "none"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 7
      },
      "id": 30,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Number of generated tokens per request",
      "type": "timeseries"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 15
      },
      "id": 20,
      "panels": [],
      "title": "General",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 30,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 6,
        "x": 0,
        "y": 16
      },
      "id": 4,
      "maxDataPoints": 100,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "sum(increase(tgi_request_success{container=\"$service\"}[1m]))",
          "legendFormat": "Success",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "sum(increase(tgi_request_failure{container=\"$service\"}[1m])) by (err)",
          "hide": false,
          "legendFormat": "Error: {{err}}",
          "range": true,
          "refId": "B"
        }
      ],
      "title": "Requests",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 13,
        "w": 9,
        "x": 6,
        "y": 16
      },
      "id": 6,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Mean Time Per Token quantiles",
      "type": "timeseries"
    },
    {
      "cards": {},
      "color": {
        "cardColor": "#5794F2",
        "colorScale": "linear",
        "colorScheme": "interpolateSpectral",
        "exponent": 0.5,
        "min": 0,
        "mode": "opacity"
      },
      "dataFormat": "tsbuckets",
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "scaleDistribution": {
              "type": "linear"
            }
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 13,
        "w": 9,
        "x": 15,
        "y": 16
      },
      "heatmap": {},
      "hideZeroBuckets": false,
      "highlightCards": true,
      "id": 13,
      "legend": {
        "show": false
      },
      "maxDataPoints": 25,
      "options": {
        "calculate": false,
        "calculation": {},
        "cellGap": 2,
        "cellValues": {},
        "color": {
          "exponent": 0.5,
          "fill": "#5794F2",
          "min": 0,
          "mode": "scheme",
          "reverse": false,
          "scale": "exponential",
          "scheme": "Spectral",
          "steps": 128
        },
        "exemplars": {
          "color": "rgba(255,0,255,0.7)"
        },
        "filterValues": {
          "le": 1e-9
        },
        "legend": {
          "show": false
        },
        "rowsFrame": {
          "layout": "auto"
        },
        "showValue": "never",
        "tooltip": {
          "mode": "single",
          "showColorScale": false,
          "yHistogram": false
        },
        "yAxis": {
          "axisPlacement": "left",
          "decimals": 1,
          "reverse": false,
          "unit": "s"
        }
      },
      "pluginVersion": "10.4.2",
      "reverseYBuckets": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "sum(increase(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[5m])) by (le)",
          "format": "heatmap",
          "interval": "",
          "legendFormat": "{{ le }}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Mean Time Per Token",
      "tooltip": {
        "show": true,
        "showHistogram": false
      },
      "type": "heatmap",
      "xAxis": {
        "show": true
      },
      "yAxis": {
        "decimals": 1,
        "format": "s",
        "logBase": 1,
        "show": true
      },
      "yBucketBound": "auto"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "percentage",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "orange",
                "value": 70
              },
              {
                "color": "red",
                "value": 85
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 5,
        "w": 3,
        "x": 0,
        "y": 24
      },
      "id": 18,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": false
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "9.1.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "count(tgi_request_count{container=\"$service\"})",
          "legendFormat": "Replicas",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Number of replicas",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "mappings": [],
          "thresholds": {
            "mode": "percentage",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "orange",
                "value": 70
              },
              {
                "color": "red",
                "value": 85
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 5,
        "w": 3,
        "x": 3,
        "y": 24
      },
      "id": 32,
      "options": {
        "minVizHeight": 75,
        "minVizWidth": 75,
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "sizing": "auto"
      },
      "pluginVersion": "10.4.2",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "sum(tgi_queue_size{container=\"$service\"})",
          "legendFormat": "__auto",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Queue Size",
      "type": "gauge"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 29
      },
      "id": 26,
      "panels": [],
      "title": "Batching",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "bars",
            "fillOpacity": 50,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 5,
        "w": 6,
        "x": 0,
        "y": 30
      },
      "id": 29,
      "maxDataPoints": 40,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": false
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "9.1.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "avg(tgi_batch_current_max_tokens{container=\"$service\"})",
          "legendFormat": "{{ pod }}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Max tokens per batch",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "none"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 9,
        "w": 4,
        "x": 6,
        "y": 30
      },
      "id": 33,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Speculated Tokens",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "none"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 9,
        "w": 5,
        "x": 10,
        "y": 30
      },
      "id": 46,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Prompt Tokens",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 9,
        "w": 9,
        "x": 15,
        "y": 30
      },
      "id": 8,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Latency quantiles",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "bars",
            "fillOpacity": 50,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "normal"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 4,
        "w": 6,
        "x": 0,
        "y": 35
      },
      "id": 27,
      "maxDataPoints": 40,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": false
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "pluginVersion": "9.1.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "avg(tgi_batch_current_size{container=\"$service\"})",
          "legendFormat": "{{ pod }}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Batch Size",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 30,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 9,
        "w": 6,
        "x": 0,
        "y": 39
      },
      "id": 28,
      "maxDataPoints": 100,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "sum(increase(tgi_batch_concat{container=\"$service\"}[1m])) by (reason)",
          "hide": false,
          "legendFormat": "Reason: {{ reason }}",
          "range": true,
          "refId": "B"
        }
      ],
      "title": "Concatenates",
      "type": "timeseries"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 9,
        "w": 9,
        "x": 6,
        "y": 39
      },
      "id": 31,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Queue quantiles",
      "type": "timeseries"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 48
      },
      "id": 22,
      "panels": [],
      "title": "Prefill",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 11,
        "w": 12,
        "x": 0,
        "y": 49
      },
      "id": 7,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Prefill Quantiles",
      "type": "timeseries"
    },
    {
      "cards": {},
      "color": {
        "cardColor": "#5794F2",
        "colorScale": "linear",
        "colorScheme": "interpolateSpectral",
        "exponent": 0.5,
        "min": 0,
        "mode": "opacity"
      },
      "dataFormat": "tsbuckets",
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "scaleDistribution": {
              "type": "linear"
            }
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 11,
        "w": 12,
        "x": 12,
        "y": 49
      },
      "heatmap": {},
      "hideZeroBuckets": false,
      "highlightCards": true,
      "id": 14,
      "legend": {
        "show": false
      },
      "maxDataPoints": 25,
      "options": {
        "calculate": false,
        "calculation": {},
        "cellGap": 2,
        "cellValues": {},
        "color": {
          "exponent": 0.5,
          "fill": "#5794F2",
          "min": 0,
          "mode": "scheme",
          "reverse": false,
          "scale": "exponential",
          "scheme": "Spectral",
          "steps": 128
        },
        "exemplars": {
          "color": "rgba(255,0,255,0.7)"
        },
        "filterValues": {
          "le": 1e-9
        },
        "legend": {
          "show": false
        },
        "rowsFrame": {
          "layout": "auto"
        },
        "showValue": "never",
        "tooltip": {
          "mode": "single",
          "showColorScale": false,
          "yHistogram": false
        },
        "yAxis": {
          "axisPlacement": "left",
          "decimals": 1,
          "reverse": false,
          "unit": "s"
        }
      },
      "pluginVersion": "10.4.2",
      "reverseYBuckets": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[5m])) by (le)",
          "format": "heatmap",
          "interval": "",
          "legendFormat": "{{ le }}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Prefill Latency",
      "tooltip": {
        "show": true,
        "showHistogram": false
      },
      "type": "heatmap",
      "xAxis": {
        "show": true
      },
      "yAxis": {
        "decimals": 1,
        "format": "s",
        "logBase": 1,
        "show": true
      },
      "yBucketBound": "auto"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 60
      },
      "id": 24,
      "panels": [],
      "title": "Decode",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 11,
        "w": 12,
        "x": 0,
        "y": 61
      },
      "id": 11,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Decode quantiles",
      "type": "timeseries"
    },
    {
      "cards": {},
      "color": {
        "cardColor": "#5794F2",
        "colorScale": "linear",
        "colorScheme": "interpolateSpectral",
        "exponent": 0.5,
        "min": 0,
        "mode": "opacity"
      },
      "dataFormat": "tsbuckets",
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "scaleDistribution": {
              "type": "linear"
            }
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 11,
        "w": 12,
        "x": 12,
        "y": 61
      },
      "heatmap": {},
      "hideZeroBuckets": false,
      "highlightCards": true,
      "id": 15,
      "legend": {
        "show": false
      },
      "maxDataPoints": 25,
      "options": {
        "calculate": false,
        "calculation": {},
        "cellGap": 2,
        "cellValues": {},
        "color": {
          "exponent": 0.5,
          "fill": "#5794F2",
          "min": 0,
          "mode": "scheme",
          "reverse": false,
          "scale": "exponential",
          "scheme": "Spectral",
          "steps": 128
        },
        "exemplars": {
          "color": "rgba(255,0,255,0.7)"
        },
        "filterValues": {
          "le": 1e-9
        },
        "legend": {
          "show": false
        },
        "rowsFrame": {
          "layout": "auto"
        },
        "showValue": "never",
        "tooltip": {
          "mode": "single",
          "showColorScale": false,
          "yHistogram": false
        },
        "yAxis": {
          "axisPlacement": "left",
          "decimals": 1,
          "reverse": false,
          "unit": "s"
        }
      },
      "pluginVersion": "10.4.2",
      "reverseYBuckets": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
          "format": "heatmap",
          "interval": "",
          "legendFormat": "{{ le }}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Decode Latency",
      "tooltip": {
        "show": true,
        "showHistogram": false
      },
      "type": "heatmap",
      "xAxis": {
        "show": true
      },
      "yAxis": {
        "decimals": 1,
        "format": "s",
        "logBase": 1,
        "show": true
      },
      "yBucketBound": "auto"
    },
    {
      "collapsed": false,
      "gridPos": {
        "h": 1,
        "w": 24,
        "x": 0,
        "y": 72
      },
      "id": 43,
      "panels": [],
      "title": "Debug",
      "type": "row"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 11,
        "w": 6,
        "x": 0,
        "y": 73
      },
      "id": 38,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Forward quantiles",
      "type": "timeseries"
    },
    {
      "cards": {},
      "color": {
        "cardColor": "#5794F2",
        "colorScale": "linear",
        "colorScheme": "interpolateSpectral",
        "exponent": 0.5,
        "min": 0,
        "mode": "opacity"
      },
      "dataFormat": "tsbuckets",
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "scaleDistribution": {
              "type": "linear"
            }
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 11,
        "w": 6,
        "x": 6,
        "y": 73
      },
      "heatmap": {},
      "hideZeroBuckets": false,
      "highlightCards": true,
      "id": 35,
      "legend": {
        "show": false
      },
      "maxDataPoints": 25,
      "options": {
        "calculate": false,
        "calculation": {},
        "cellGap": 2,
        "cellValues": {},
        "color": {
          "exponent": 0.5,
          "fill": "#5794F2",
          "min": 0,
          "mode": "scheme",
          "reverse": false,
          "scale": "exponential",
          "scheme": "Spectral",
          "steps": 128
        },
        "exemplars": {
          "color": "rgba(255,0,255,0.7)"
        },
        "filterValues": {
          "le": 1e-9
        },
        "legend": {
          "show": false
        },
        "rowsFrame": {
          "layout": "auto"
        },
        "showValue": "never",
        "tooltip": {
          "mode": "single",
          "showColorScale": false,
          "yHistogram": false
        },
        "yAxis": {
          "axisPlacement": "left",
          "decimals": 1,
          "reverse": false,
          "unit": "s"
        }
      },
      "pluginVersion": "10.4.2",
      "reverseYBuckets": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "sum(increase(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
          "format": "heatmap",
          "interval": "",
          "legendFormat": "{{ le }}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Forward Latency",
      "tooltip": {
        "show": true,
        "showHistogram": false
      },
      "type": "heatmap",
      "xAxis": {
        "show": true
      },
      "yAxis": {
        "decimals": 1,
        "format": "s",
        "logBase": 1,
        "show": true
      },
      "yBucketBound": "auto"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 11,
        "w": 6,
        "x": 12,
        "y": 73
      },
      "id": 34,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Token Decode quantiles",
      "type": "timeseries"
    },
    {
      "cards": {},
      "color": {
        "cardColor": "#5794F2",
        "colorScale": "linear",
        "colorScheme": "interpolateSpectral",
        "exponent": 0.5,
        "min": 0,
        "mode": "opacity"
      },
      "dataFormat": "tsbuckets",
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "scaleDistribution": {
              "type": "linear"
            }
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 11,
        "w": 6,
        "x": 18,
        "y": 73
      },
      "heatmap": {},
      "hideZeroBuckets": false,
      "highlightCards": true,
      "id": 40,
      "legend": {
        "show": false
      },
      "maxDataPoints": 25,
      "options": {
        "calculate": false,
        "calculation": {},
        "cellGap": 2,
        "cellValues": {},
        "color": {
          "exponent": 0.5,
          "fill": "#5794F2",
          "min": 0,
          "mode": "scheme",
          "reverse": false,
          "scale": "exponential",
          "scheme": "Spectral",
          "steps": 128
        },
        "exemplars": {
          "color": "rgba(255,0,255,0.7)"
        },
        "filterValues": {
          "le": 1e-9
        },
        "legend": {
          "show": false
        },
        "rowsFrame": {
          "layout": "auto"
        },
        "showValue": "never",
        "tooltip": {
          "mode": "single",
          "showColorScale": false,
          "yHistogram": false
        },
        "yAxis": {
          "axisPlacement": "left",
          "decimals": 1,
          "reverse": false,
          "unit": "s"
        }
      },
      "pluginVersion": "10.4.2",
      "reverseYBuckets": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "sum(increase(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
          "format": "heatmap",
          "interval": "",
          "legendFormat": "{{ le }}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Token Decode Latency",
      "tooltip": {
        "show": true,
        "showHistogram": false
      },
      "type": "heatmap",
      "xAxis": {
        "show": true
      },
      "yAxis": {
        "decimals": 1,
        "format": "s",
        "logBase": 1,
        "show": true
      },
      "yBucketBound": "auto"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 11,
        "w": 6,
        "x": 0,
        "y": 84
      },
      "id": 42,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Filter Batch quantiles",
      "type": "timeseries"
    },
    {
      "cards": {},
      "color": {
        "cardColor": "#5794F2",
        "colorScale": "linear",
        "colorScheme": "interpolateSpectral",
        "exponent": 0.5,
        "min": 0,
        "mode": "opacity"
      },
      "dataFormat": "tsbuckets",
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "scaleDistribution": {
              "type": "linear"
            }
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 11,
        "w": 6,
        "x": 6,
        "y": 84
      },
      "heatmap": {},
      "hideZeroBuckets": false,
      "highlightCards": true,
      "id": 39,
      "legend": {
        "show": false
      },
      "maxDataPoints": 25,
      "options": {
        "calculate": false,
        "calculation": {},
        "cellGap": 2,
        "cellValues": {},
        "color": {
          "exponent": 0.5,
          "fill": "#5794F2",
          "min": 0,
          "mode": "scheme",
          "reverse": false,
          "scale": "exponential",
          "scheme": "Spectral",
          "steps": 128
        },
        "exemplars": {
          "color": "rgba(255,0,255,0.7)"
        },
        "filterValues": {
          "le": 1e-9
        },
        "legend": {
          "show": false
        },
        "rowsFrame": {
          "layout": "auto"
        },
        "showValue": "never",
        "tooltip": {
          "mode": "single",
          "showColorScale": false,
          "yHistogram": false
        },
        "yAxis": {
          "axisPlacement": "left",
          "decimals": 1,
          "reverse": false,
          "unit": "s"
        }
      },
      "pluginVersion": "10.4.2",
      "reverseYBuckets": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "sum(increase(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
          "format": "heatmap",
          "interval": "",
          "legendFormat": "{{ le }}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Filter Batch Latency",
      "tooltip": {
        "show": true,
        "showHistogram": false
      },
      "type": "heatmap",
      "xAxis": {
        "show": true
      },
      "yAxis": {
        "decimals": 1,
        "format": "s",
        "logBase": 1,
        "show": true
      },
      "yBucketBound": "auto"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "never",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          },
          "unit": "s"
        },
        "overrides": [
          {
            "matcher": {
              "id": "byName",
              "options": "p50"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "green",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p90"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "orange",
                  "mode": "fixed"
                }
              }
            ]
          },
          {
            "matcher": {
              "id": "byName",
              "options": "p99"
            },
            "properties": [
              {
                "id": "color",
                "value": {
                  "fixedColor": "red",
                  "mode": "fixed"
                }
              }
            ]
          }
        ]
      },
      "gridPos": {
        "h": 11,
        "w": 6,
        "x": 12,
        "y": 84
      },
      "id": 36,
      "options": {
        "legend": {
          "calcs": [
            "min",
            "max"
          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "legendFormat": "p50",
          "range": true,
          "refId": "A"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p90",
          "range": true,
          "refId": "B"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
          "hide": false,
          "legendFormat": "p99",
          "range": true,
          "refId": "C"
        }
      ],
      "title": "Batch Concat quantiles",
      "type": "timeseries"
    },
    {
      "cards": {},
      "color": {
        "cardColor": "#5794F2",
        "colorScale": "linear",
        "colorScheme": "interpolateSpectral",
        "exponent": 0.5,
        "min": 0,
        "mode": "opacity"
      },
      "dataFormat": "tsbuckets",
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
      },
      "fieldConfig": {
        "defaults": {
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "scaleDistribution": {
              "type": "linear"
            }
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 11,
        "w": 6,
        "x": 18,
        "y": 84
      },
      "heatmap": {},
      "hideZeroBuckets": false,
      "highlightCards": true,
      "id": 41,
      "legend": {
        "show": false
      },
      "maxDataPoints": 25,
      "options": {
        "calculate": false,
        "calculation": {},
        "cellGap": 2,
        "cellValues": {},
        "color": {
          "exponent": 0.5,
          "fill": "#5794F2",
          "min": 0,
          "mode": "scheme",
          "reverse": false,
          "scale": "exponential",
          "scheme": "Spectral",
          "steps": 128
        },
        "exemplars": {
          "color": "rgba(255,0,255,0.7)"
        },
        "filterValues": {
          "le": 1e-9
        },
        "legend": {
          "show": false
        },
        "rowsFrame": {
          "layout": "auto"
        },
        "showValue": "never",
        "tooltip": {
          "mode": "single",
          "showColorScale": false,
          "yHistogram": false
        },
        "yAxis": {
          "axisPlacement": "left",
          "decimals": 1,
          "reverse": false,
          "unit": "s"
        }
      },
      "pluginVersion": "10.4.2",
      "reverseYBuckets": false,
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
          },
          "editorMode": "code",
          "exemplar": true,
          "expr": "sum(increase(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
          "format": "heatmap",
          "interval": "",
          "legendFormat": "{{ le }}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Batch Concat latency",
      "tooltip": {
        "show": true,
        "showHistogram": false
      },
      "type": "heatmap",
      "xAxis": {
        "show": true
      },
      "yAxis": {
        "decimals": 1,
        "format": "s",
        "logBase": 1,
        "show": true
      },
      "yBucketBound": "auto"
    }
  ],
  "refresh": "",
  "schemaVersion": 39,
  "tags": [],
  "templating": {
    "list": [
      {
        "current": {
          "selected": false,
          "text": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1",
          "value": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1"
        },
        "datasource": {
          "type": "prometheus",
          "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
        },
        "definition": "label_values(tgi_request_count, container)",
        "hide": 0,
        "includeAll": false,
        "multi": false,
        "name": "service",
        "options": [],
        "query": {
          "query": "label_values(tgi_request_count, container)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "type": "query"
      }
    ]
  },
  "time": {
    "from": "now-30m",
    "to": "now-30s"
  },
  "timepicker": {
    "nowDelay": "30s"
  },
  "timezone": "",
  "title": "Text Generation Inference",
  "uid": "RHSk7EL4kdqsd",
  "version": 12,
  "weekStart": ""
}


================================================
FILE: backends/client/Cargo.toml
================================================
[package]
name = "text-generation-client"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[dependencies]
async-trait = "^0.1"
base64 = { workspace = true }
futures = "^0.3"
grpc-metadata = { path = "../grpc-metadata" }
prost = "^0.12"
thiserror = "^1.0"
tokio = { version = "^1.32", features = ["sync"] }
tonic = "^0.10"
tower = "^0.4"
tracing = "^0.1"

[build-dependencies]
tonic-build = "0.10.1"
prost-build = "0.12.1"


================================================
FILE: backends/client/build.rs
================================================
use std::fs;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("cargo:rerun-if-changed=../../proto/");

    fs::create_dir_all("src/v2/pb").unwrap_or(());
    let mut config = prost_build::Config::new();
    config.protoc_arg("--experimental_allow_proto3_optional");

    tonic_build::configure()
        .build_client(true)
        .build_server(false)
        .out_dir("src/v2/pb")
        .include_file("mod.rs")
        .compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
        .map_err(|e| match e.kind(){
            std::io::ErrorKind::NotFound => {panic!("`protoc` not found, install libprotoc")},
            std::io::ErrorKind::Other => {panic!("`protoc` version unsupported, upgrade protoc: https://github.com/protocolbuffers/protobuf/releases")},
            e => {e}
        }).unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));

    fs::create_dir_all("src/v3/pb").unwrap_or(());
    let mut config = prost_build::Config::new();
    config.protoc_arg("--experimental_allow_proto3_optional");

    tonic_build::configure()
        .build_client(true)
        .build_server(false)
        .out_dir("src/v3/pb")
        .include_file("mod.rs")
        .compile_with_config(config, &["../../proto/v3/generate.proto"], &["../../proto"])
        .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));

    Ok(())
}


================================================
FILE: backends/client/src/lib.rs
================================================
//! Text Generation gRPC client library

pub mod v2;
pub mod v3;

use async_trait::async_trait;
use base64::{engine::general_purpose::STANDARD, Engine};
use thiserror::Error;
use tonic::transport;
use tonic::Status;

pub use v3::{Chunk, Image, Input, InputChunk};

#[async_trait]
pub trait Health {
    /// Check if a generate server is healthy by asking it to allocate a tensor on device
    async fn device_health(&self) -> Result<()>;

    /// Check if a generate server is healthy by doing a forward pass.
    /// EXPENSIVE
    async fn model_health(&self) -> Result<()>;
}

#[derive(Debug)]
pub struct ShardInfo {
    pub requires_padding: bool,
    pub dtype: String,
    pub device_type: String,
    pub window_size: Option<u32>,
    pub speculate: u32,
}

#[derive(Error, Debug, Clone)]
pub enum ClientError {
    #[error("Could not connect to Text Generation server: {0}")]
    Connection(String),
    #[error("Server error: {0}")]
    Generation(String),
    #[error("Sharded results are empty")]
    EmptyResults,
}

impl From<Status> for ClientError {
    fn from(err: Status) -> Self {
        let err = Self::Generation(err.message().to_string());
        tracing::error!("{err}");
        err
    }
}

impl From<transport::Error> for ClientError {
    fn from(err: transport::Error) -> Self {
        let err = Self::Connection(err.to_string());
        tracing::error!("{err}");
        err
    }
}

// Small convenience re-wrapping of `Chunk`.
impl From<Chunk> for InputChunk {
    fn from(chunk: Chunk) -> Self {
        InputChunk { chunk: Some(chunk) }
    }
}

/// Convert input chunks to a stringly-typed input for backwards
/// compat for backends that haven't implemented chunked inputs.
pub trait ChunksToString {
    /// Convert chunks to string.
    fn chunks_to_string(&self) -> String;
}

impl ChunksToString for Vec<InputChunk> {
    fn chunks_to_string(&self) -> String {
        let mut output = String::new();
        self.iter().for_each(|c| match &c.chunk {
            Some(Chunk::Text(text)) => output.push_str(text),
            Some(Chunk::Image(Image { data, mimetype })) => {
                let encoded = STANDARD.encode(data);
                output.push_str(&format!("![](data:{};base64,{})", mimetype, encoded))
            }
            // We don't create empty chunks, so this should be unreachable.
            None => unreachable!("Chunks should never be empty"),
        });
        output
    }
}

static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";

pub type Result<T> = std::result::Result<T, ClientError>;


================================================
FILE: backends/client/src/v2/client.rs
================================================
/// Single shard Client
use crate::v2::pb;
use crate::{ClientError, Result};

use crate::WARMUP_IMAGE_BASE64;
use grpc_metadata::InjectTelemetryContext;
use pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
use pb::generate::v2::*;
use std::cmp::min;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tracing::instrument;

/// Text Generation Inference gRPC client
#[derive(Debug, Clone)]
pub struct Client {
    stub: TextGenerationServiceClient<Channel>,
}

impl Client {
    /// Returns a client connected to the given url
    pub async fn connect(uri: Uri) -> Result<Self> {
        let channel = Channel::builder(uri).connect().await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a client connected to the given unix socket
    pub async fn connect_uds(path: String) -> Result<Self> {
        let channel = Channel::from_shared("http://[::]:50051".to_string())
            .unwrap()
            .connect_with_connector(tower::service_fn(move |_: Uri| {
                tokio::net::UnixStream::connect(path.clone())
            }))
            .await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a list of uris or unix sockets of all shards
    #[instrument(skip(self))]
    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
        let response = self.stub.service_discovery(request).await.map_err(|_| {
            ClientError::Connection("Server does not support v2 interface".to_string())
        })?;
        let urls = response
            .into_inner()
            .urls
            .into_iter()
            // Remove unix socket prefix
            .map(|url| match url.strip_prefix("unix://") {
                None => url,
                Some(stripped_url) => stripped_url.to_string(),
            })
            .collect();
        Ok(urls)
    }

    /// Get model info
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<InfoResponse> {
        let request = tonic::Request::new(InfoRequest {}).inject_context();
        let response = self.stub.info(request).await?.into_inner();
        Ok(response)
    }

    /// Get model health
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let request = tonic::Request::new(HealthRequest {}).inject_context();
        let response = self.stub.health(request).await?.into_inner();
        Ok(response)
    }

    /// Clear the past generations cache
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
        self.stub.clear_cache(request).await?;
        Ok(())
    }

    /// Filter a cached batch
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let request = tonic::Request::new(FilterBatchRequest {
            batch_id,
            request_ids,
        })
        .inject_context();
        let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
        Ok(filtered_batch.batch)
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip_all)]
    pub async fn warmup(
        &mut self,
        max_input_length: u32,
        max_prefill_tokens: u32,
        max_total_tokens: u32,
        max_batch_size: Option<usize>,
    ) -> Result<Option<u32>> {
        let mut n_tokens = 0;
        let mut requests = Vec::new();
        // Create requests
        while n_tokens < max_prefill_tokens {
            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);

            let mut inputs = String::new();
            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
            if n_tokens == 0 {
                // 1 request is enough to test vision heads.
                // Sending images on other queries messes up easily with truncation.
                inputs.push_str(&format!(
                    "![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
                ));
            }

            requests.push(Request {
                id: 0,
                inputs,
                // We truncate the input on the server side to be sure that it has the correct size
                truncate,
                // Set sampling parameters to also take these ops into account in the max memory
                parameters: Some(NextTokenChooserParameters {
                    temperature: 0.9,
                    top_k: 10,
                    top_p: 0.9,
                    typical_p: 0.9,
                    do_sample: false,
                    seed: 0,
                    repetition_penalty: 1.2,
                    frequency_penalty: 0.1,
                    watermark: true,
                    grammar: String::new(),
                    grammar_type: GrammarType::None as i32,
                }),
                stopping_parameters: Some(StoppingCriteriaParameters {
                    max_new_tokens: max_total_tokens - truncate,
                    stop_sequences: vec![],
                    ignore_eos_token: true,
                }),
                prefill_logprobs: true,
                top_n_tokens: 20,
            });
            n_tokens += max_input_length;

            // Check max_batch_size
            if Some(requests.len()) == max_batch_size {
                break;
            }
        }

        let batch = Batch {
            id: 0,
            size: requests.len() as u32,
            requests,
            max_tokens: 0,
        };

        let request = tonic::Request::new(WarmupRequest {
            batch: Some(batch),
            max_input_length,
            max_prefill_tokens,
            max_total_tokens,
        })
        .inject_context();
        let response = self.stub.warmup(request).await?.into_inner();
        Ok(response.max_supported_total_tokens)
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
        let response = self.stub.prefill(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
        ))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
        let response = self.stub.decode(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            DecodeTimings::new(
                response.concat_ns,
                response.forward_ns,
                response.decode_ns,
                response.total_ns,
            ),
        ))
    }
}

pub struct PrefillTimings {
    pub forward: Duration,
    pub decode: Duration,
    pub total: Duration,
}

impl PrefillTimings {
    fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
        Self {
            forward: Duration::from_nanos(forward_ns),
            decode: Duration::from_nanos(decode_ns),
            total: Duration::from_nanos(total_ns),
        }
    }
}

pub struct DecodeTimings {
    pub concat: Option<Duration>,
    pub forward: Duration,
    pub decode: Duration,
    pub total: Duration,
}

impl DecodeTimings {
    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
        Self {
            concat: concat_ns.map(Duration::from_nanos),
            forward: Duration::from_nanos(forward_ns),
            decode: Duration::from_nanos(decode_ns),
            total: Duration::from_nanos(total_ns),
        }
    }
}


================================================
FILE: backends/client/src/v2/mod.rs
================================================
#[allow(clippy::derive_partial_eq_without_eq)]
mod pb;

mod client;
mod sharded_client;

pub use client::Client;
pub use pb::generate::v2::HealthResponse;
pub use pb::generate::v2::{
    Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType, InfoResponse,
    NextTokenChooserParameters, Request, StoppingCriteriaParameters, Tokens,
};
pub use sharded_client::ShardedClient;


================================================
FILE: backends/client/src/v2/sharded_client.rs
================================================
/// Multi shard Client
use crate::{v2, Health, ShardInfo};
use crate::{ClientError, Result};

use crate::v2::InfoResponse;
use async_trait::async_trait;
use futures::future::join_all;
use tonic::transport::Uri;
use tracing::instrument;
use v2::client::{DecodeTimings, PrefillTimings};
use v2::{
    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};

#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client
pub struct ShardedClient {
    clients: Vec<Client>,
}

impl ShardedClient {
    fn new(clients: Vec<Client>) -> Self {
        Self { clients }
    }

    /// Create a new ShardedClient from a master client. The master client will communicate with
    /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
    async fn from_master_client(mut master_client: Client) -> Result<Self> {
        // Get all uris/unix sockets from the master client
        let uris = master_client.service_discovery().await?;
        let futures = uris.into_iter().map(Client::connect_uds);
        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
        Ok(Self::new(clients?))
    }

    /// Returns a client connected to the given uri
    pub async fn connect(uri: Uri) -> Result<Self> {
        let master_client = Client::connect(uri).await?;
        Self::from_master_client(master_client).await
    }

    /// Returns a client connected to the given unix socket
    pub async fn connect_uds(path: String) -> Result<Self> {
        let master_client = Client::connect_uds(path).await?;
        Self::from_master_client(master_client).await
    }

    /// Get the model info
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<ShardInfo> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.info())
            .collect();
        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
    }

    /// GRPC health check
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.health())
            .collect();
        join_all(futures).await.pop().unwrap()
    }

    /// Clear the past generations cache
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.clear_cache(batch_id))
            .collect();
        join_all(futures).await.into_iter().collect()
    }

    /// Filter a cached batch
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
            .collect();
        // all shards return the same message
        join_all(futures).await.pop().unwrap()
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip(self))]
    pub async fn warmup(
        &mut self,
        max_input_length: u32,
        max_prefill_tokens: u32,
        max_total_tokens: u32,
        max_batch_size: Option<usize>,
    ) -> Result<Option<u32>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| {
                Box::pin(client.warmup(
                    max_input_length,
                    max_prefill_tokens,
                    max_total_tokens,
                    max_batch_size,
                ))
            })
            .collect();
        // Take the minimum value
        let results = join_all(futures)
            .await
            .into_iter()
            .collect::<Result<Vec<Option<u32>>>>()?;
        Ok(results.into_iter().flatten().min())
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.prefill(batch.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.decode(batches.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }
}

impl From<InfoResponse> for ShardInfo {
    fn from(value: InfoResponse) -> Self {
        Self {
            requires_padding: value.requires_padding,
            dtype: value.dtype,
            device_type: value.device_type,
            window_size: value.window_size,
            speculate: value.speculate,
        }
    }
}

#[async_trait]
impl Health for ShardedClient {
    async fn device_health(&self) -> Result<()> {
        self.clone().health().await?;
        Ok(())
    }

    async fn model_health(&self) -> Result<()> {
        // Dummy batch of 1 token and 1 generated token
        let liveness_request = Request {
            id: u64::MAX,
            inputs: "liveness".to_string(),
            truncate: 10,
            prefill_logprobs: false,
            parameters: Some(NextTokenChooserParameters {
                temperature: 1.0,
                top_k: 0,
                top_p: 1.0,
                typical_p: 1.0,
                do_sample: false,
                seed: 0,
                repetition_penalty: 1.0,
                frequency_penalty: 0.0,
                watermark: false,
                grammar: String::new(),
                grammar_type: GrammarType::None as i32,
            }),
            stopping_parameters: Some(StoppingCriteriaParameters {
                max_new_tokens: 1,
                stop_sequences: vec![],
                ignore_eos_token: false,
            }),
            top_n_tokens: 0,
        };
        let batch = Batch {
            id: u64::MAX,
            requests: vec![liveness_request],
            size: 1,
            max_tokens: 2,
        };
        self.clone().prefill(batch).await?;
        Ok(())
    }
}


================================================
FILE: backends/client/src/v3/client.rs
================================================
use crate::v3::{pb, Chunk};
use crate::{ClientError, Result, WARMUP_IMAGE_BASE64};
/// Single shard Client
use base64::engine::general_purpose::STANDARD;
use base64::Engine;
use grpc_metadata::InjectTelemetryContext;
use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
use pb::generate::v3::*;
use std::cmp::min;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tracing::instrument;

/// Text Generation Inference gRPC client
#[derive(Debug, Clone)]
pub struct Client {
    stub: TextGenerationServiceClient<Channel>,
}

impl Client {
    /// Returns a client connected to the given url
    pub async fn connect(uri: Uri) -> Result<Self> {
        let channel = Channel::builder(uri).connect().await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a client connected to the given unix socket
    pub async fn connect_uds(path: String) -> Result<Self> {
        let channel = Channel::from_shared("http://[::]:50051".to_string())
            .unwrap()
            .connect_with_connector(tower::service_fn(move |_: Uri| {
                tokio::net::UnixStream::connect(path.clone())
            }))
            .await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a list of uris or unix sockets of all shards
    #[instrument(skip(self))]
    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
        let response = self.stub.service_discovery(request).await.map_err(|_| {
            ClientError::Connection("Server does not support v3 interface".to_string())
        })?;
        let urls = response
            .into_inner()
            .urls
            .into_iter()
            // Remove unix socket prefix
            .map(|url| match url.strip_prefix("unix://") {
                None => url,
                Some(stripped_url) => stripped_url.to_string(),
            })
            .collect();
        Ok(urls)
    }

    /// Get model info
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<InfoResponse> {
        let request = tonic::Request::new(InfoRequest {}).inject_context();
        let response = self.stub.info(request).await?.into_inner();
        Ok(response)
    }

    /// Get model health
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let request = tonic::Request::new(HealthRequest {}).inject_context();
        let response = self.stub.health(request).await?.into_inner();
        Ok(response)
    }

    /// Clear the past generations cache
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
        self.stub.clear_cache(request).await?;
        Ok(())
    }

    /// Filter a cached batch
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let request = tonic::Request::new(FilterBatchRequest {
            batch_id,
            request_ids,
        })
        .inject_context();
        let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
        Ok(filtered_batch.batch)
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip_all)]
    pub async fn warmup(
        &mut self,
        max_input_tokens: Option<u32>,
        max_prefill_tokens: u32,
        max_total_tokens: Option<u32>,
        max_batch_size: Option<usize>,
    ) -> Result<(Option<u32>, u32, u32)> {
        let mut n_tokens = 0;
        let mut requests = Vec::new();
        // Create requests
        while n_tokens < max_prefill_tokens {
            let mut truncate = max_prefill_tokens - n_tokens;
            if let Some(max_input_tokens) = max_input_tokens {
                truncate = min(max_input_tokens, truncate);
            }

            let mut input_chunks = Vec::new();
            input_chunks.push(Chunk::Text("_test ".to_string().repeat(truncate as usize)).into());
            if n_tokens == 0 {
                input_chunks.push(
                    Chunk::Image(Image {
                        // Safe unwrap, because we control the data.
                        data: STANDARD.decode(WARMUP_IMAGE_BASE64).unwrap(),
                        mimetype: "image/jpeg;base64".to_string(),
                    })
                    .into(),
                );
            }

            // Send stringly-typed inputs for compatibility for backends that haven't
            // been updated to support chunks.

            let mut inputs = String::new();
            inputs.push_str(&"_test ".to_string().repeat(truncate as usize));
            if n_tokens == 0 {
                // 1 request is enough to test vision heads.
                // Sending images on other queries messes up easily with truncation.
                inputs.push_str(&format!(
                    "![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
                ));
            }

            let max_new_tokens = if let Some(max_total_tokens) = max_total_tokens {
                max_total_tokens - truncate
            } else {
                1
            };

            requests.push(Request {
                id: 0,
                inputs,
                input_chunks: Some(Input {
                    chunks: input_chunks,
                }),
                // We truncate the input on the server side to be sure that it has the correct size
                truncate,
                // Most request will have that
                add_special_tokens: true,
                // Blocks and slots will be set on the server side if we use paged attention
                blocks: vec![],
                slots: vec![],
                cache_len: 0,
                chunk_len: None,
                // Set sampling parameters to also take these ops into account in the max memory
                parameters: Some(NextTokenChooserParameters {
                    temperature: 0.9,
                    top_k: 10,
                    top_p: 0.9,
                    typical_p: 0.9,
                    do_sample: false,
                    seed: 0,
                    repetition_penalty: 1.2,
                    frequency_penalty: 0.1,
                    watermark: true,
                    grammar: String::new(),
                    grammar_type: GrammarType::None as i32,
                }),
                stopping_parameters: Some(StoppingCriteriaParameters {
                    max_new_tokens,
                    stop_sequences: vec![],
                    ignore_eos_token: true,
                }),
                prefill_logprobs: true,
                top_n_tokens: 20,
                adapter_id: None,
            });
            n_tokens += truncate;

            // Check max_batch_size
            if Some(requests.len()) == max_batch_size {
                break;
            }
        }

        let batch = Batch {
            id: 0,
            size: requests.len() as u32,
            requests,
            max_tokens: max_input_tokens.unwrap_or(0),
            max_blocks: 0,
        };

        let request = tonic::Request::new(WarmupRequest {
            batch: Some(batch),
            max_input_tokens,
            max_prefill_tokens,
            max_total_tokens,
        })
        .inject_context();
        let response = self.stub.warmup(request).await?.into_inner();
        Ok((
            response.max_supported_total_tokens,
            response.max_input_tokens,
            response.max_total_tokens,
        ))
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
        cached_batch: Option<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let request = tonic::Request::new(PrefillRequest {
            batch: Some(batch),
            cached_batch,
        })
        .inject_context();
        let response = self.stub.prefill(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
        ))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
        let response = self.stub.decode(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            DecodeTimings::new(
                response.concat_ns,
                response.forward_ns,
                response.decode_ns,
                response.total_ns,
            ),
        ))
    }
}

pub struct PrefillTimings {
    pub forward: Duration,
    pub decode: Duration,
    pub total: Duration,
}

impl PrefillTimings {
    fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
        Self {
            forward: Duration::from_nanos(forward_ns),
            decode: Duration::from_nanos(decode_ns),
            total: Duration::from_nanos(total_ns),
        }
    }
}

pub struct DecodeTimings {
    pub concat: Option<Duration>,
    pub forward: Duration,
    pub decode: Duration,
    pub total: Duration,
}

impl DecodeTimings {
    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
        Self {
            concat: concat_ns.map(Duration::from_nanos),
            forward: Duration::from_nanos(forward_ns),
            decode: Duration::from_nanos(decode_ns),
            total: Duration::from_nanos(total_ns),
        }
    }
}


================================================
FILE: backends/client/src/v3/mod.rs
================================================
#[allow(clippy::derive_partial_eq_without_eq)]
mod pb;

mod client;
mod sharded_client;

pub use client::Client;
pub use pb::generate::v3::{
    input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
    HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
    StoppingCriteriaParameters, Tokens,
};
pub use sharded_client::ShardedClient;


================================================
FILE: backends/client/src/v3/sharded_client.rs
================================================
/// Multi shard Client
use crate::{v3, Health, ShardInfo};
use crate::{ClientError, Result};

use crate::v3::{Chunk, InfoResponse, Input};
use async_trait::async_trait;
use futures::future::join_all;
use tonic::transport::Uri;
use tracing::instrument;
use v3::client::{DecodeTimings, PrefillTimings};
use v3::{
    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};

#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client
pub struct ShardedClient {
    clients: Vec<Client>,
}

impl ShardedClient {
    fn new(clients: Vec<Client>) -> Self {
        Self { clients }
    }

    /// Create a new ShardedClient from a master client. The master client will communicate with
    /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
    async fn from_master_client(mut master_client: Client) -> Result<Self> {
        // Get all uris/unix sockets from the master client
        let uris = master_client.service_discovery().await?;
        let futures = uris.into_iter().map(Client::connect_uds);
        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
        Ok(Self::new(clients?))
    }

    /// Returns a client connected to the given uri
    pub async fn connect(uri: Uri) -> Result<Self> {
        let master_client = Client::connect(uri).await?;
        Self::from_master_client(master_client).await
    }

    /// Returns a client connected to the given unix socket
    pub async fn connect_uds(path: String) -> Result<Self> {
        let master_client = Client::connect_uds(path).await?;
        Self::from_master_client(master_client).await
    }

    /// Get the model info
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<ShardInfo> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.info())
            .collect();
        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
    }

    /// GRPC health check
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.health())
            .collect();
        join_all(futures).await.pop().unwrap()
    }

    /// Clear the past generations cache
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.clear_cache(batch_id))
            .collect();
        join_all(futures).await.into_iter().collect()
    }

    /// Filter a cached batch
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
            .collect();
        // all shards return the same message
        join_all(futures).await.pop().unwrap()
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip(self))]
    pub async fn warmup(
        &mut self,
        max_input_length: Option<u32>,
        max_prefill_tokens: u32,
        max_total_tokens: Option<u32>,
        max_batch_size: Option<usize>,
    ) -> Result<(Option<u32>, u32, u32)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| {
                Box::pin(client.warmup(
                    max_input_length,
                    max_prefill_tokens,
                    max_total_tokens,
                    max_batch_size,
                ))
            })
            .collect();
        // Take the minimum value
        let results = join_all(futures)
            .await
            .into_iter()
            .collect::<Result<Vec<(Option<u32>, u32, u32)>>>()?;

        // Take the minimum value
        // Different shards hold different parts of vocab, might yield
        // different available block size.
        let min = results
            .iter()
            .min()
            .expect("Expect at least 1 warmup result");
        Ok(*min)
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
        cached_batch: Option<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.prefill(batch.clone(), cached_batch.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.decode(batches.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }
}

impl From<InfoResponse> for ShardInfo {
    fn from(value: InfoResponse) -> Self {
        Self {
            requires_padding: value.requires_padding,
            dtype: value.dtype,
            device_type: value.device_type,
            window_size: value.window_size,
            speculate: value.speculate,
        }
    }
}

#[async_trait]
impl Health for ShardedClient {
    async fn device_health(&self) -> Result<()> {
        self.clone().health().await?;
        Ok(())
    }

    async fn model_health(&self) -> Result<()> {
        // Dummy batch of 1 token and 1 generated token
        let liveness_request = Request {
            id: u64::MAX,
            inputs: "liveness".to_string(),
            input_chunks: Some(Input {
                chunks: vec![Chunk::Text("liveness".into()).into()],
            }),
            truncate: 10,
            add_special_tokens: true,
            prefill_logprobs: false,
            parameters: Some(NextTokenChooserParameters {
                temperature: 1.0,
                top_k: 0,
                top_p: 1.0,
                typical_p: 1.0,
                do_sample: false,
                seed: 0,
                repetition_penalty: 1.0,
                frequency_penalty: 0.0,
                watermark: false,
                grammar: String::new(),
                grammar_type: GrammarType::None as i32,
            }),
            stopping_parameters: Some(StoppingCriteriaParameters {
                max_new_tokens: 1,
                stop_sequences: vec![],
                ignore_eos_token: false,
            }),
            top_n_tokens: 0,
            // Block 0 is reserved for health checks
            blocks: vec![0],
            slots: (0..16).collect(),
            cache_len: 0,
            chunk_len: None,
            adapter_id: None,
        };
        let batch = Batch {
            id: u64::MAX,
            requests: vec![liveness_request],
            size: 1,
            max_tokens: 2,
            max_blocks: 1,
        };
        self.clone().prefill(batch, None).await?;
        Ok(())
    }
}


================================================
FILE: backends/gaudi/Makefile
================================================
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
root_dir := ${mkfile_dir}/../..

HABANA_VERSION := 1.21.0
PYTORCH_VERSION := 2.6.0

.PHONY:	image run-local-dev-container install-dependencies install-server install-router install-launcher local-dev-install

image:
	docker build --ulimit nofile=4096 -t tgi-gaudi -f ${root_dir}/Dockerfile_gaudi ${root_dir} --build-arg HABANA_VERSION=$(HABANA_VERSION) --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION)

run-local-dev-container:
		docker run -it \
		--runtime=habana \
		--ipc=host \
		--cap-add=sys_nice \
		--net=host \
		-e HABANA_VISIBLE_DEVICES=all \
		-e OMPI_MCA_btl_vader_single_copy_mechanism=none \
		-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
		-e HF_TOKEN=`cat /home/ubuntu/.cache/huggingface/token` \
		-e LOG_LEVEL=debug \
		-e PORT=8080 \
		-v /home/ubuntu/.cache/huggingface:/data \
		-v $(PWD):/text-generation-inference \
		-w /text-generation-inference \
		vault.habana.ai/gaudi-docker/$(HABANA_VERSION)/ubuntu22.04/habanalabs/pytorch-installer-$(PYTORCH_VERSION):latest

install-dependencies:
	pip install git+https://github.com/HabanaAI/DeepSpeed.git@$(HABANA_VERSION)
	pip install outlines~=0.0.34
	curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

install-server:
	make -C ${root_dir}/backends/gaudi/server install PROTO_PATH=../../../proto/v3

install-router:
	make -C ${root_dir} install-router

install-launcher:
	make -C ${root_dir} install-launcher

# use source to load the rust in path
local-dev-install: install-dependencies
	bash -c 'source "$$HOME/.cargo/env" && \
		make install-server && \
		make install-router && \
		make install-launcher'

# In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
run-integration-tests:
	DOCKER_VOLUME=${root_dir}/data \
	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
    pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi

run-integration-tests-with-all-models:
	DOCKER_VOLUME=${root_dir}/data \
	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi --gaudi-all-models

# This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
capture-expected-outputs-for-integration-tests:
	pip install -U pip uv
	DOCKER_VOLUME=${root_dir}/data \
	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
	uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests/capture_expected_outputs.py


================================================
FILE: backends/gaudi/README.md
================================================
# Text-generation-inference - Gaudi backend

## Description

This is the TGI backend for Intel Gaudi. This backend is composed of the tgi server optimized for Gaudi hardware.

## Build your own image

The simplest way to build TGI with the Gaudi backend is to use the provided `Makefile`:

Option 1: From the project root directory:
```bash
make -C backends/gaudi image
```

Option 2: From the Gaudi backend directory:
```bash
cd backends/gaudi
make image
```

You can now run the server with the following command:

Option 1: Sharded:
```bash
model=meta-llama/Llama-3.1-8B-Instruct
hf_token=$(cat ${HOME}/.cache/huggingface/token)
volume=${HOME}/.cache/huggingface

docker run --runtime=habana --ipc=host --cap-add=sys_nice \
  -p 8080:80 -v $volume:/data \
  -e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \
  tgi-gaudi --model-id $model \
  --sharded true --num-shard 8 \
  --max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 8 --max-batch-prefill-tokens 2048
```

Option 2: Non-sharded:
```bash
model=meta-llama/Llama-3.1-8B-Instruct
hf_token=$(cat ${HOME}/.cache/huggingface/token)
volume=${HOME}/.cache/huggingface

docker run --runtime=habana --ipc=host --cap-add=sys_nice \
  -p 8080:80 -v $volume:/data \
  -e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \
  tgi-gaudi --model-id $model \
  --max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 4 --max-batch-prefill-tokens 2048
```

## Contributing

### Local Development

This is useful if you want to run the server locally for better debugging.
```bash
make -C backends/gaudi run-local-dev-container
```

Then run the following command inside the container to install tgi for gaudi:
```bash
make -C backends/gaudi local-dev-install
```

Add rust to path:
```bash
. "$HOME/.cargo/env"
```

Option 1: Run the server (sharded model):
```bash
LOG_LEVEL=debug text-generation-launcher \
    --model-id meta-llama/Llama-3.1-8B-Instruct \
    --sharded true \
    --num-shard 8 \
    --max-input-tokens 512 \
    --max-total-tokens 1024 \
    --max-batch-size 8 \
    --max-batch-prefill-tokens 2048
```

Option 2: Run the server (non-sharded model):
```bash
LOG_LEVEL=debug text-generation-launcher \
    --model-id meta-llama/Llama-3.1-8B-Instruct \
    --max-input-tokens 512 \
    --max-total-tokens 1024 \
    --max-batch-size 4 \
    --max-batch-prefill-tokens 2048
```

You can then test the server with the following curl command from another terminal (can be outside the container):
```bash
curl 127.0.0.1:8080/generate \
     -X POST \
     -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
     -H 'Content-Type: application/json'
```

### Integration tests

Install the dependencies:
```bash
pip install -r integration-tests/requirements.txt
```

To run the integration tests, you need to first build the image:
```bash
make -C backends/gaudi image
```

Then run the following command to run the integration tests (CI tests):
```bash
make -C backends/gaudi run-integration-tests
```

To run the integration tests with all models, you can run the following command:
```bash
make -C backends/gaudi run-integration-tests-with-all-models
```

To capture the expected outputs for the integration tests, you can run the following command:
```bash
make -C backends/gaudi capture-expected-outputs-for-integration-tests
```

#### How the integration tests works
The integration tests works as follows:

1. Start a tgi server in a container, similar to the command:
```bash
docker run --runtime=habana --ipc=host --cap-add=sys_nice \
  -p 8080:80 -v $volume:/data \
  -e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \
  tgi-gaudi --model-id $model \
  --max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 4 --max-batch-prefill-tokens 2048
```

2. Do a /generate request to the server, similar to the command:
```bash
curl 127.0.0.1:8080/generate \
     -X POST \
     -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
     -H 'Content-Type: application/json'
```

3. Check the output of the server against the expected output:
```python
assert curl_output == expected_output
```

This is the repeated for a set of models and configurations.


================================================
FILE: backends/gaudi/examples/docker_commands/docker_commands.md
================================================
# Examples of Docker Commands for Gaudi Backend

This page gives a list of examples of docker run commands for some of the most popular models.

> **Note:** The parameters are chosen for Gaudi2 hardware to maximize performance on this given hardware, please adjust the parameters based on your hardware. For example, if you are using Gaudi3, you may want to increase the batch size.

## Default Precision (BF16)

### Llama3.1-8B on 1 card (BF16)

```bash
model=meta-llama/Meta-Llama-3.1-8B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
   --model-id $model \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```

### Llama3.1-70B 8 cards (BF16)

```bash
model=meta-llama/Meta-Llama-3.1-70B-Instruct
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
   --model-id $model \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```

### Llava-v1.6-Mistral-7B on 1 card (BF16)

```bash
model=llava-hf/llava-v1.6-mistral-7b-hf
volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
   --model-id $model \
   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
   --max-total-tokens 8192 --max-batch-size 4
```

## FP8 Precision

You could also set kv cache dtype to FP8 when launching the server, fp8_e4m3fn is supported in Gaudi

## Llama3-8B on 1 Card (FP8)

```bash
model=RedHatAI/Meta-Llama-3-8B-Instruct-FP8-KV
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
   --model-id $model \
   --kv-cache-dtype fp8_e4m3fn \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
```

## Llama3-70B on 8 cards (FP8)

```bash
model=RedHatAI/Meta-Llama-3-70B-Instruct-FP8
hf_token=YOUR_ACCESS_TOKEN
volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   -e HF_TOKEN=$hf_token \
   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
   --model-id $model \
   --kv-cache-dtype fp8_e4m3fn \
   --sharded true --num-shard 8 \
   --max-input-tokens 1024 --max-total-tokens 2048 \
   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
```


================================================
FILE: backends/gaudi/server/.gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
text_generation_server/__pycache__/
text_generation_server/pb/__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

transformers
safetensors
flash-attention/
flash-attention-v2/
vllm/
llm-awq/
eetq/
mamba/


================================================
FILE: backends/gaudi/server/Makefile
================================================
include Makefile-flash-att
include Makefile-flash-att-v2
include Makefile-vllm
include Makefile-awq
include Makefile-eetq
include Makefile-selective-scan

PROTO_PATH ?= ../proto/v3

unit-tests:
	pytest -s -vv -m "not private" tests

gen-server:
	# Compile protos
	pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
	mkdir text_generation_server/pb || true
	python -m grpc_tools.protoc -I$(PROTO_PATH) --python_out=text_generation_server/pb \
		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb $(PROTO_PATH)/generate.proto
	find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
	touch text_generation_server/pb/__init__.py

install: gen-server
	pip install pip --upgrade
	pip install --no-deps -r requirements.txt
	pip install -e "."

run-dev:
	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded

install-poetry:
	curl -sSL https://install.python-poetry.org | python3 -

update-lock:
	rm poetry.lock
	poetry lock --no-update

export-requirements:
	poetry export -o requirements.txt --without-hashes


================================================
FILE: backends/gaudi/server/Makefile-awq
================================================
# Fork that adds only the correct stream to this kernel in order
# to make cuda graphs work.
awq_commit := bd1dc2d5254345cc76ab71894651fb821275bdd4

awq:
	rm -rf llm-awq
	git clone https://github.com/huggingface/llm-awq

build-awq: awq
	cd llm-awq/ && git fetch && git checkout $(awq_commit)
	cd llm-awq/awq/kernels && python setup.py build

install-awq: build-awq
	pip uninstall awq_inference_engine -y || true
	cd llm-awq/awq/kernels && python setup.py install


================================================
FILE: backends/gaudi/server/Makefile-eetq
================================================
eetq_commit := 1657b1504faa359e2ce0ac02999439d7ac8c74c0

eetq:
    # Clone eetq
	pip install packaging
	git clone https://github.com/NetEase-FuXi/EETQ.git eetq

build-eetq: eetq
	cd eetq && git fetch && git checkout $(eetq_commit) && git submodule update --init --recursive
	cd eetq && python setup.py build

install-eetq: build-eetq
	cd eetq && python setup.py install


================================================
FILE: backends/gaudi/server/Makefile-fbgemm
================================================
fbgemm_commit := v0.8.0

build-fbgemm:
	@if [ ! -d "fbgemm" ]; then \
		git clone https://github.com/pytorch/FBGEMM.git fbgemm; \
	fi
	cd fbgemm && git fetch && git checkout $(fbgemm_commit)  && \
	git submodule update --init --recursive && \
	cd fbgemm_gpu && \
	pip install -r requirements.txt && \
	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai build

install-fbgemm: build-fbgemm
	cd fbgemm/fbgemm_gpu &&  \
	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai install


================================================
FILE: backends/gaudi/server/Makefile-flash-att
================================================
flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec

build-flash-attention:
	if [ ! -d 'flash-attention' ]; then \
		pip install -U packaging ninja  --no-cache-dir && \
		git clone https://github.com/HazyResearch/flash-attention.git; \
	fi
	cd flash-attention && git fetch && git checkout $(flash_att_commit) && \
	MAX_JOBS=8 python setup.py build && cd csrc/layer_norm && python setup.py build && cd ../rotary && python setup.py build

install-flash-attention: build-flash-attention
	cd flash-attention && git checkout $(flash_att_commit) && MAX_JOBS=8 python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install


================================================
FILE: backends/gaudi/server/Makefile-flash-att-v2
================================================
flash_att_v2_commit_cuda := v2.6.1
flash_att_v2_commit_rocm := 2092111b9f975b3347c652ff7fabd431130256c4

build-flash-attention-v2-cuda:
	pip install -U packaging wheel
	pip install flash-attn==$(flash_att_v2_commit_cuda)

install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
	echo "Flash v2 installed"

build-flash-attention-v2-rocm:
	if [ ! -d 'flash-attention-v2' ]; then \
		pip install -U packaging ninja  --no-cache-dir && \
		git clone https://github.com/mht-sharma/flash-attention.git flash-attention-v2 && \
		cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm) && \
		git submodule update --init --recursive && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build; \
	fi

install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
	cd flash-attention-v2 &&  \
	GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install


================================================
FILE: backends/gaudi/server/Makefile-selective-scan
================================================
selective_scan_commit := 2a3704fd47ba817b415627b06fd796b971fdc137

causal-conv1d:
	rm -rf causal-conv1d
	git clone https://github.com/Dao-AILab/causal-conv1d.git

build-causal-conv1d: causal-conv1d
	cd causal-conv1d/ && git checkout v1.1.1 # known latest working version tag
	cd causal-conv1d/ && CAUSAL_CONV1D_FORCE_BUILD=TRUE python setup.py build

install-causal-conv1d: build-causal-conv1d
	pip uninstall causal-conv1d -y || true
	cd causal-conv1d/ && pip install .

# selective-scan dependends on causal-conv1d
selective-scan:
	rm -rf mamba
	git clone https://github.com/state-spaces/mamba.git mamba

build-selective-scan: selective-scan
	cd mamba/ && git fetch && git checkout $(selective_scan_commit)
	cd mamba && python setup.py build

install-selective-scan: install-causal-conv1d build-selective-scan
	pip uninstall selective-scan-cuda -y || true
	cd mamba && pip install .

build-all: build-causal-conv1d build-selective-scan


================================================
FILE: backends/gaudi/server/Makefile-vllm
================================================
commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b
commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247
build-vllm-cuda:
	if [ ! -d 'vllm' ]; then \
		pip install -U ninja packaging --no-cache-dir && \
		git clone https://github.com/Narsil/vllm.git vllm; \
	fi
	cd vllm  && git fetch origin && git checkout $(commit_cuda) && python setup.py build

install-vllm-cuda: build-vllm-cuda
	cd vllm  && git fetch origin && git checkout $(commit_cuda) && pip install -e .

build-vllm-rocm:
	if [ ! -d 'vllm' ]; then \
		pip install -U ninja packaging --no-cache-dir && \
		git clone https://github.com/mht-sharma/vllm.git vllm; \
	fi
	cd vllm && git fetch && git checkout $(commit_rocm) &&  \
	PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build

install-vllm-rocm: build-vllm-rocm
	cd vllm && git fetch && git checkout $(commit_rocm) && \
	PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .


================================================
FILE: backends/gaudi/server/README.md
================================================
# Text Generation Inference Python gRPC Server

A Python gRPC server for Text Generation Inference

## Install

```shell
make install
```

## Run

```shell
make run-dev
```


================================================
FILE: backends/gaudi/server/dill-0.3.7-patch.sh
================================================
#!/bin/bash
git clone -b dill-0.3.7 https://github.com/uqfoundation/dill.git
pushd dill
cat <<EOF > dill-0.3.7.patch
diff --git a/dill/_dill.py b/dill/_dill.py
index d0cf543..f6eb662 100644
--- a/dill/_dill.py
+++ b/dill/_dill.py
@@ -69,7 +69,15 @@ TypeType = type # 'new-style' classes #XXX: unregistered
 XRangeType = range
 from types import MappingProxyType as DictProxyType, new_class
 from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, PickleError, PicklingError, UnpicklingError
-import __main__ as _main_module
+class _LazyMainModule(object):
+    _module = None
+    @property
+    def module(self):
+        if self._module is None:
+            import __main__ as _m_module
+            self._module = _m_module
+        return self._module
+_main_module = _LazyMainModule()
 import marshal
 import gc
 # import zlib
@@ -353,7 +361,7 @@ class Pickler(StockPickler):
         _fmode = kwds.pop('fmode', None)
         _recurse = kwds.pop('recurse', None)
         StockPickler.__init__(self, file, *args, **kwds)
-        self._main = _main_module
+        self._main = _main_module.module
         self._diff_cache = {}
         self._byref = settings['byref'] if _byref is None else _byref
         self._strictio = False #_strictio
@@ -435,12 +443,12 @@ class Unpickler(StockUnpickler):
         settings = Pickler.settings
         _ignore = kwds.pop('ignore', None)
         StockUnpickler.__init__(self, *args, **kwds)
-        self._main = _main_module
+        self._main = _main_module.module
         self._ignore = settings['ignore'] if _ignore is None else _ignore

     def load(self): #NOTE: if settings change, need to update attributes
         obj = StockUnpickler.load(self)
-        if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
+        if type(obj).__module__ == getattr(self._main, '__name__', '__main__'):
             if not self._ignore:
                 # point obj class to main
                 try: obj.__class__ = getattr(self._main, type(obj).__name__)
@@ -1194,11 +1202,11 @@ def save_module_dict(pickler, obj):
         logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj
         pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8'))
         logger.trace(pickler, "# D1")
-    elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__):
+    elif (not is_dill(pickler, child=False)) and (obj == _main_module.module.__dict__):
         logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj
         pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8'))  #XXX: works in general?
         logger.trace(pickler, "# D3")
-    elif '__name__' in obj and obj != _main_module.__dict__ \\
+    elif '__name__' in obj and obj != _main_module.module.__dict__ \\
             and type(obj['__name__']) is str \\
             and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None):
         logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj
diff --git a/dill/session.py b/dill/session.py
index 74234ab..1be8d89 100644
--- a/dill/session.py
+++ b/dill/session.py
@@ -233,7 +233,7 @@ def dump_module(
     protocol = settings['protocol']
     main = module
     if main is None:
-        main = _main_module
+        main = _main_module.module
     elif isinstance(main, str):
         main = _import_module(main)
     if not isinstance(main, ModuleType):
@@ -501,7 +501,7 @@ def load_module(
             pass
     assert loaded is main
     _restore_modules(unpickler, main)
-    if main is _main_module or main is module:
+    if main is _main_module.module or main is module:
         return None
     else:
         return main

EOF
git apply dill-0.3.7.patch
python -m pip install .
popd
rm -fr dill


================================================
FILE: backends/gaudi/server/dill-0.3.8-patch.sh
================================================
#!/bin/bash
git clone -b 0.3.8 https://github.com/uqfoundation/dill.git
pushd dill
cat <<EOF > dill-0.3.8.patch
diff --git a/dill/_dill.py b/dill/_dill.py
index d42432f..1d251e6 100644
--- a/dill/_dill.py
+++ b/dill/_dill.py
@@ -69,7 +69,15 @@ TypeType = type # 'new-style' classes #XXX: unregistered
 XRangeType = range
 from types import MappingProxyType as DictProxyType, new_class
 from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, PickleError, PicklingError, UnpicklingError
-import __main__ as _main_module
+class _LazyMainModule(object):
+    _module = None
+    @property
+    def module(self):
+        if self._module is None:
+            import __main__ as _m_module
+            self._module = _m_module
+        return self._module
+_main_module = _LazyMainModule()
 import marshal
 import gc
 # import zlib
@@ -355,7 +363,7 @@ class Pickler(StockPickler):
         _fmode = kwds.pop('fmode', None)
         _recurse = kwds.pop('recurse', None)
         StockPickler.__init__(self, file, *args, **kwds)
-        self._main = _main_module
+        self._main = _main_module.module
         self._diff_cache = {}
         self._byref = settings['byref'] if _byref is None else _byref
         self._strictio = False #_strictio
@@ -437,12 +445,12 @@ class Unpickler(StockUnpickler):
         settings = Pickler.settings
         _ignore = kwds.pop('ignore', None)
         StockUnpickler.__init__(self, *args, **kwds)
-        self._main = _main_module
+        self._main = _main_module.module
         self._ignore = settings['ignore'] if _ignore is None else _ignore

     def load(self): #NOTE: if settings change, need to update attributes
         obj = StockUnpickler.load(self)
-        if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
+        if type(obj).__module__ == getattr(self._main, '__name__', '__main__'):
             if not self._ignore:
                 # point obj class to main
                 try: obj.__class__ = getattr(self._main, type(obj).__name__)
@@ -1199,11 +1207,11 @@ def save_module_dict(pickler, obj):
         logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj
         pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8'))
         logger.trace(pickler, "# D1")
-    elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__):
+    elif (not is_dill(pickler, child=False)) and (obj == _main_module.module.__dict__):
         logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj
         pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8'))  #XXX: works in general?
         logger.trace(pickler, "# D3")
-    elif '__name__' in obj and obj != _main_module.__dict__ \\
+    elif '__name__' in obj and obj != _main_module.module.__dict__ \\
             and type(obj['__name__']) is str \\
             and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None):
         logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj
diff --git a/dill/session.py b/dill/session.py
index e91068a..a921b43 100644
--- a/dill/session.py
+++ b/dill/session.py
@@ -233,7 +233,7 @@ def dump_module(
     protocol = settings['protocol']
     main = module
     if main is None:
-        main = _main_module
+        main = _main_module.module
     elif isinstance(main, str):
         main = _import_module(main)
     if not isinstance(main, ModuleType):
@@ -501,7 +501,7 @@ def load_module(
             pass
     assert loaded is main
     _restore_modules(unpickler, main)
-    if main is _main_module or main is module:
+    if main is _main_module.module or main is module:
         return None
     else:
         return main

EOF
git apply dill-0.3.8.patch
python -m pip install .
popd
rm -fr dill


================================================
FILE: backends/gaudi/server/pyproject.toml
================================================
[tool.poetry]
name = "text-generation-server"
version = "2.0.4"
description = "Text Generation Inference Python gRPC Server"
authors = ["Olivier Dehaene <olivier@huggingface.co>"]

[tool.poetry.scripts]
text-generation-server = 'text_generation_server.cli:app'

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
protobuf = "^5.0"
grpcio = "^1.71.1"
grpcio-status = "*"
grpcio-reflection = "*"
grpc-interceptor = "^0.15.0"
typer = "^0.15.0"
loguru = "^0.7.3"
opentelemetry-api = "^1.32.0"
opentelemetry-exporter-otlp = "^1.32.0"
opentelemetry-instrumentation-grpc = "^0.53b0"
hf-transfer = "^0.1.9"
sentencepiece = "^0.2.0"
peft = "^0.15"
transformers = "^4.52.4"
numpy = "^1.26"
accelerate = "^1.7.0"
outlines= { version = "^0.0.36", optional = true }
prometheus-client = "^0.21.1"
py-cpuinfo = "^9.0.0"

[tool.poetry.group.dev.dependencies]
grpcio-tools = "*"
pytest = "^8.3.5"

[tool.pytest.ini_options]
markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.requires-plugins]
poetry-plugin-export = ">=1.8"


================================================
FILE: backends/gaudi/server/requirements.txt
================================================
accelerate==1.7.0 ; python_version >= "3.9" and python_version < "3.13"
annotated-types==0.7.0 ; python_version >= "3.9" and python_version < "3.13"
attrs==25.3.0 ; python_version >= "3.9" and python_version < "3.13"
certifi==2025.1.31 ; python_version >= "3.9" and python_version < "3.13"
charset-normalizer==3.4.1 ; python_version >= "3.9" and python_version < "3.13"
click==8.1.8 ; python_version >= "3.9" and python_version < "3.13"
cloudpickle==3.1.1 ; python_version >= "3.9" and python_version < "3.13"
colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Windows" or python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
deprecated==1.2.18 ; python_version >= "3.9" and python_version < "3.13"
diffusers==0.31.0 ; python_version >= "3.9" and python_version < "3.13"
diskcache==5.6.3 ; python_version >= "3.9" and python_version < "3.13"
filelock==3.18.0 ; python_version >= "3.9" and python_version < "3.13"
fsspec==2025.3.2 ; python_version >= "3.9" and python_version < "3.13"
googleapis-common-protos==1.70.0 ; python_version >= "3.9" and python_version < "3.13"
grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
grpcio-reflection==1.71.0 ; python_version >= "3.9" and python_version < "3.13"
grpcio-status==1.71.0 ; python_version >= "3.9" and python_version < "3.13"
grpcio==1.72.0rc1 ; python_version >= "3.9" and python_version < "3.13"
hf-transfer==0.1.9 ; python_version >= "3.9" and python_version < "3.13"
huggingface-hub==0.30.2 ; python_version >= "3.9" and python_version < "3.13"
idna==3.10 ; python_version >= "3.9" and python_version < "3.13"
importlib-metadata==8.6.1 ; python_version >= "3.9" and python_version < "3.13"
interegular==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
jinja2==3.1.6 ; python_version >= "3.9" and python_version < "3.13"
joblib==1.4.2 ; python_version >= "3.9" and python_version < "3.13"
jsonschema-specifications==2024.10.1 ; python_version >= "3.9" and python_version < "3.13"
jsonschema==4.23.0 ; python_version >= "3.9" and python_version < "3.13"
lark==1.2.2 ; python_version >= "3.9" and python_version < "3.13"
llvmlite==0.43.0 ; python_version >= "3.9" and python_version < "3.13"
loguru==0.7.3 ; python_version >= "3.9" and python_version < "3.13"
markdown-it-py==3.0.0 ; python_version >= "3.9" and python_version < "3.13"
markupsafe==3.0.2 ; python_version >= "3.9" and python_version < "3.13"
mdurl==0.1.2 ; python_version >= "3.9" and python_version < "3.13"
mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
nest-asyncio==1.6.0 ; python_version >= "3.9" and python_version < "3.13"
networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13"
numba==0.60.0 ; python_version >= "3.9" and python_version < "3.13"
numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-api==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-exporter-otlp-proto-common==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-exporter-otlp-proto-grpc==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-exporter-otlp-proto-http==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-exporter-otlp==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-instrumentation-grpc==0.53b0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-instrumentation==0.53b0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-proto==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-sdk==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
opentelemetry-semantic-conventions==0.53b0 ; python_version >= "3.9" and python_version < "3.13"
optimum==1.24.0 ; python_version >= "3.9" and python_version < "3.13"
outlines==0.0.36 ; python_version >= "3.9" and python_version < "3.13"
packaging==24.2 ; python_version >= "3.9" and python_version < "3.13"
peft==0.15.1 ; python_version >= "3.9" and python_version < "3.13"
pillow==11.2.1 ; python_version >= "3.9" and python_version < "3.13"
prometheus-client==0.21.1 ; python_version >= "3.9" and python_version < "3.13"
protobuf==5.29.4 ; python_version >= "3.9" and python_version < "3.13"
psutil==7.0.0 ; python_version >= "3.9" and python_version < "3.13"
py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
pydantic-core==2.33.1 ; python_version >= "3.9" and python_version < "3.13"
pydantic==2.11.3 ; python_version >= "3.9" and python_version < "3.13"
pygments==2.19.1 ; python_version >= "3.9" and python_version < "3.13"
pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13"
referencing==0.36.2 ; python_version >= "3.9" and python_version < "3.13"
regex==2024.11.6 ; python_version >= "3.9" and python_version < "3.13"
requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13"
rich==14.0.0 ; python_version >= "3.9" and python_version < "3.13"
rpds-py==0.24.0 ; python_version >= "3.9" and python_version < "3.13"
safetensors==0.5.3 ; python_version >= "3.9" and python_version < "3.13"
scikit-learn==1.6.1 ; python_version >= "3.9" and python_version < "3.13"
scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
sentence-transformers==3.3.1 ; python_version >= "3.9" and python_version < "3.13"
sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
setuptools==78.1.0 ; python_version >= "3.12" and python_version < "3.13"
shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13"
sympy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
threadpoolctl==3.6.0 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.21.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.67.1 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.52.4 ; python_version >= "3.9" and python_version < "3.13"
triton==3.2.0 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64"
typer==0.15.2 ; python_version >= "3.9" and python_version < "3.13"
typing-extensions==4.13.2 ; python_version >= "3.9" and python_version < "3.13"
typing-inspection==0.4.0 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.4.0 ; python_version >= "3.9" and python_version < "3.13"
win32-setctime==1.2.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
wrapt==1.17.2 ; python_version >= "3.9" and python_version < "3.13"
zipp==3.21.0 ; python_version >= "3.9" and python_version < "3.13"


================================================
FILE: backends/gaudi/server/text_generation_server/__init__.py
================================================


================================================
FILE: backends/gaudi/server/text_generation_server/adapters/__init__.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/adapters/__init__.py
# License:  Apache License Version 2.0, January 2004

from text_generation_server.adapters.weights import (
    AdapterBatchData,
    AdapterBatchMetadata,
)

__all__ = [
    "AdapterBatchData",
    "AdapterBatchMetadata",
]


================================================
FILE: backends/gaudi/server/text_generation_server/adapters/config.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/adapters/config.py
# License:  Apache License Version 2.0, January 2004

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Set, Tuple

import torch

from text_generation_server.adapters.weights import AdapterWeights


@dataclass
class ModuleMap:
    module_name: str
    module_weights: Dict[str, Tuple[torch.Tensor, str]]


@dataclass
class AdapterConfig(ABC):
    base_model_name_or_path: str

    @abstractmethod
    def map_weights_for_model(
        self,
        adapter_weights: Dict[int, AdapterWeights],
        weight_names: Tuple[str],
    ) -> Tuple[ModuleMap, Set[str]]:
        pass


================================================
FILE: backends/gaudi/server/text_generation_server/adapters/lora.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/adapters/lora.py
# License:  Apache License Version 2.0, January 2004

from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple, Type, Union

import torch
from peft import LoraConfig as _LoraConfig
from torch.distributed import ProcessGroup

from text_generation_server.adapters.config import AdapterConfig, ModuleMap

from text_generation_server.adapters.weights import (
    AdapterBatchMetadata,
    AdapterWeights,
    BatchAdapterWeights,
)
from text_generation_server.utils.sgmv import (
    BGMV_MAX_RANK,
    MAX_RANK_CUSTOM,
    get_tmp_tensors,
    orient_for_rank,
    pad_rank,
    use_cutlass_shrink,
)


def get_start_stop_idxs_for_rank(offset, size, rank, world_size):
    block_size = size // world_size
    start = offset + rank * block_size
    stop = offset + (rank + 1) * block_size
    return start, stop


def shard_on_dim(
    t: torch.Tensor, dim: int, process_group: torch.distributed.ProcessGroup
):
    world_size = process_group.size()
    rank = process_group.rank()

    size = t.shape[dim]
    start, stop = get_start_stop_idxs_for_rank(0, size, rank, world_size)

    if dim == 0:
        tensor = t[start:stop]
    elif dim == 1:
        tensor = t[:, start:stop]
    else:
        raise NotImplementedError("Let's make that generic when needed")

    return tensor


def shard_lora_weights(
    weights_a: List[torch.Tensor],
    weights_b: List[torch.Tensor],
    split_dim: int,
    process_group: ProcessGroup,
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    # [hidden_size, r]
    weights_a = [
        shard_on_dim(w, dim=split_dim, process_group=process_group) for w in weights_a
    ]

    # [r, hidden_size]
    weights_b = [shard_on_dim(w, dim=1, process_group=process_group) for w in weights_b]

    return weights_a, weights_b


@dataclass
class LoraConfig(AdapterConfig):
    r: int
    target_modules: Optional[Union[List[str], str]]
    fan_in_fan_out: bool
    lora_alpha: int
    use_rslora: bool

    def map_weights_for_model(
        self,
        adapter_weights: Dict[int, AdapterWeights],
        weight_names: Tuple[str],
    ) -> Tuple[ModuleMap, Set[str]]:
        adapter_weight_names = set()
        module_map = {}
        for weight_name in weight_names:
            lora_a_name = f"base_model.model.{weight_name}.lora_A.weight"
            lora_b_name = f"base_model.model.{weight_name}.lora_B.weight"
            if lora_a_name not in adapter_weights or lora_b_name not in adapter_weights:
                continue

            module_map[weight_name] = {
                "lora_A": (adapter_weights[lora_a_name], lora_a_name),
                "lora_B": (adapter_weights[lora_b_name], lora_b_name),
            }
            adapter_weight_names.add(lora_a_name)
            adapter_weight_names.add(lora_b_name)
        return module_map, adapter_weight_names

    @classmethod
    def load(cls, adapter_id: str, api_token: str) -> "LoraConfig":
        hf_config = _LoraConfig.from_pretrained(adapter_id, token=api_token)
        return cls(
            base_model_name_or_path=hf_config.base_model_name_or_path,
            r=hf_config.r,
            target_modules=hf_config.target_modules,
            fan_in_fan_out=hf_config.fan_in_fan_out,
            lora_alpha=hf_config.lora_alpha,
            use_rslora=(
                hf_config.use_rslora if hasattr(hf_config, "use_rslora") else False
            ),
        )


class LoraWeights(AdapterWeights):
    """LoRA weights for a single adapter merged across all layers."""

    def __init__(
        self,
        weights_a: List[torch.Tensor],
        weights_b: List[torch.Tensor],
        adapter_config: LoraConfig,
    ):
        self.lora_a_r = weights_a[0].size(1) if len(weights_a) > 0 else 1
        self.lora_b_r = weights_b[0].size(0) if len(weights_a) > 0 else 1

        self._use_cutlass_shrink = use_cutlass_shrink(self.lora_a_r)
        self._is_transposed = False

        # [num_layers, hidden_size, r]
        weights_a = [orient_for_rank(w, w.size(1)).contiguous() for w in weights_a]
        self._weights_a = torch.stack(weights_a)

        # [num_layers, r, hidden_size]
        self._weights_b = torch.stack(weights_b)

        self.adapter_config = adapter_config

    @property
    def weights_a(self) -> torch.Tensor:
        if self._is_transposed:
            self._transpose_weights()
        return self._weights_a

    @property
    def weights_b(self) -> torch.Tensor:
        if self._is_transposed:
            self._transpose_weights()
        return self._weights_b

    @property
    def weights_a_t(self) -> torch.Tensor:
        if not self._is_transposed:
            self._transpose_weights()
        return self._weights_a

    @property
    def weights_b_t(self) -> torch.Tensor:
        if not self._is_transposed:
            self._transpose_weights()
        return self._weights_b

    def _transpose_weights(self):
        if self._use_cutlass_shrink:
            # If we're not using the cutlass shrink, then both SGMV and BGMV use the same orientation
            self._weights_a = self._weights_a.transpose(1, 2).contiguous()
        self._weights_b = self._weights_b.transpose(1, 2).contiguous()
        self._is_transposed = not self._is_transposed

    @classmethod
    def get_batch_types(cls) -> List[Type[BatchAdapterWeights]]:
        return [BatchLoraWeights]

    # prepare pre-loaded lora weights for use in the model.
    #
    # this method processes and organizes lora weights for a specific layer type across all layers:
    # - uses `config` (LoraConfig) to apply lora-specific settings like scaling factor.
    # - retrieves weights from `module_map` based on the `layer_type`.
    # - processes `nlayers` number of layers.
    # - converts weights to the specified `dtype`.
    # - shards weights across `world_size` number of processes using the `process_group`.
    # - maps weights to specific layers using `target_to_layer`.
    # - tracks `unused_weight_names` to identify any unused weights.
    #
    # the method handles weight transposition, scaling, and padding to ensure compatibility
    # with SGMV or BGMV operations.
    @classmethod
    def prepare_weights(
        cls,
        config: LoraConfig,
        module_map: Dict[str, Dict],
        layer_type: str,
        unused_weight_names: Set[str],
        nlayers: int,
        dtype: torch.dtype,
        world_size: int,
        process_group: ProcessGroup,
        target_to_layer: Dict[str, Tuple[str, torch.Tensor]],
    ) -> Optional[AdapterWeights]:
        lora_a_list = [None] * nlayers
        lora_b_list = [None] * nlayers

        for layer_id in range(nlayers):
            key = (layer_id, layer_type)
            weight_name, layer = target_to_layer[key]
            base_weight = layer.base_layer.linear.weight
            base_device = base_weight.device

            if weight_name not in module_map:
                # There is no LoRA weight for this layer type in the adapter
                return None

            lora_a, lora_a_name = module_map[weight_name]["lora_A"]
            lora_a = lora_a.to(base_device, dtype)

            lora_b, lora_b_name = module_map[weight_name]["lora_B"]
            lora_b = lora_b.to(base_device, dtype)

            scale = get_scaling_factor(
                config.lora_alpha,
                config.r,
                uses_rslora=config.use_rslora,
            )

            unused_weight_names.discard(lora_a_name)
            unused_weight_names.discard(lora_b_name)

            # Merge scaling factor into lora_b due to associativity of matrix multiplication:
            # (A * B) * C = A * (B * C)
            lora_a_list[layer_id] = lora_a.transpose(0, 1)
            lora_b_list[layer_id] = lora_b.transpose(0, 1) * scale

        # pad lora ranks to be compatible with sgmv
        lora_a_list = [pad_rank(w, dim=1, world_size=world_size) for w in lora_a_list]
        lora_b_list = [pad_rank(w, dim=0, world_size=world_size) for w in lora_b_list]

        if lora_a_list:
            # update rank if it was padded
            padded_rank = lora_a_list[0].size(1)
            config.r = padded_rank

        return LoraWeights(
            *shard_lora_weights(
                weights_a=lora_a_list,
                weights_b=lora_b_list,
                split_dim=0 if layer_type in {"o_proj", "down_proj", "lm_head"} else 1,
                process_group=process_group,
            ),
            config,
        )


@dataclass
class RankSegments:
    rank: int

    lora_a_ptr: torch.Tensor
    lora_b_ptr: torch.Tensor

    # prefill (sgmv)
    tmp_shrink: torch.Tensor
    tmp_expand: torch.Tensor
    segment_starts: torch.Tensor
    segment_ends: torch.Tensor

    # decode (bgmv)
    indices: torch.Tensor


@dataclass
class BatchLoraWeights(BatchAdapterWeights):
    lora_a: Dict[int, torch.Tensor]
    lora_b: Dict[int, torch.Tensor]
    adapter_index_configs: Dict[int, LoraConfig]
    rank_data: Dict[int, RankSegments]
    use_sgmv: bool

    def has_adapter(self, adapter_index: int) -> bool:
        return adapter_index in self.adapter_index_configs

    def can_vectorize(self, pg: ProcessGroup) -> bool:
        return all(
            rank_data.rank // pg.size() <= MAX_RANK_CUSTOM
            for rank_data in self.rank_data.values()
        )

    @classmethod
    def load(
        self,
        adapter_weights: Dict[int, AdapterWeights],
        meta: AdapterBatchMetadata,
        prefill: bool,
        prefill_head_indices: Optional[torch.Tensor],
    ) -> Optional["BatchLoraWeights"]:
        adapter_weights = {k: _convert_lora(v) for k, v in adapter_weights.items()}
        adapter_weights = {
            k: v for k, v in adapter_weights.items() if isinstance(v, LoraWeights)
        }
        if not adapter_weights:
            return None

        first_weights = next(iter(adapter_weights.values()))
        device = first_weights.weights_a.device
        segment_indices = meta.segment_indices

        lora_a = {
            idx: adapter_weights[idx].weights_a
            for idx in segment_indices
            if idx in adapter_weights
        }
        lora_b = {
            idx: adapter_weights[idx].weights_b
            for idx in segment_indices
            if idx in adapter_weights
        }

        max_rank = max(
            (
                adapter_weights[idx].lora_a_r
                for idx in segment_indices
                if idx in adapter_weights
            ),
            default=0,
        )

        if prefill or max_rank > BGMV_MAX_RANK:
            use_sgmv = True
            lora_a_ptr = torch.tensor(
                [
                    (
                        adapter_weights[idx].weights_a.data_ptr()
                        if idx in adapter_weights
                        else 0
                    )
                    for idx in segment_indices
                ],
                dtype=torch.int64,
                device=device,
            )
            lora_b_ptr = torch.tensor(
                [
                    (
                        adapter_weights[idx].weights_b.data_ptr()
                        if idx in adapter_weights
                        else 0
                    )
                    for idx in segment_indices
                ],
                dtype=torch.int64,
                device=device,
            )
        else:
            use_sgmv = False
            lora_a_ptr = torch.tensor(
                [
                    (
                        adapter_weights[idx].weights_a_t.data_ptr()
                        if idx in adapter_weights
                        else 0
                    )
                    for idx in segment_indices
                ],
                dtype=torch.int64,
                device=device,
            )
            lora_b_ptr = torch.tensor(
                [
                    (
                        adapter_weights[idx].weights_b_t.data_ptr()
                        if idx in adapter_weights
                        else 0
                    )
                    for idx in segment_indices
                ],
                dtype=torch.int64,
                device=device,
            )

        adapter_index_configs = {
            idx: adapter_weights[idx].adapter_config
            for idx in segment_indices
            if idx in adapter_weights
        }

        adapter_to_segment = {v: k for k, v in enumerate(segment_indices)}

        rank_indices = defaultdict(list)
        for segment_idx, adapter_idx in enumerate(segment_indices):
            if adapter_idx not in adapter_weights:
                continue
            rank_indices[adapter_weights[adapter_idx].lora_a_r].append(segment_idx)

        if prefill_head_indices is not None:
            j, prefill_head_segment_starts, prefill_head_segment_ends = 1, [0], [0]
            for head_index in prefill_head_indices:
                # j cannot go out of bounds as that would mean there are tokens without corresponding adapters
                if head_index < meta.adapter_segments[j]:
                    prefill_head_segment_ends[-1] += 1
                else:
                    prefill_head_segment_starts.append(prefill_head_segment_ends[-1])
                    prefill_head_segment_ends.append(prefill_head_segment_ends[-1] + 1)
                    j += 1

        rank_data = {}
        for rank, indices in rank_indices.items():
            tmp_shrink = None
            tmp_expand = None
            segment_starts = None
            segment_ends = None
            batch_indices = None

            if use_sgmv:
                lora_a_ptr_indices = lora_a_ptr[indices]
                tmp_shrink, tmp_expand = get_tmp_tensors(
                    lora_a_ptr_indices.size(0), rank, device
                )
                segment_starts = meta.adapter_segments[indices]
                segment_ends = meta.adapter_segments[[i + 1 for i in indices]]
                if prefill_head_indices is not None:
                    for i, segment_index in enumerate(indices):
                        segment_starts[i] = prefill_head_segment_starts[segment_index]
                        segment_ends[i] = prefill_head_segment_ends[segment_index]
            else:
                rank_indices = set(indices)
                batch_indices = [
                    adapter_to_segment[idx] for idx in meta.adapter_indices.tolist()
                ]
                batch_indices = [
                    idx if idx in rank_indices else -1 for idx in batch_indices
                ]
                batch_indices = torch.tensor(
                    batch_indices, dtype=torch.int64, device=device
                )

            rank_data[rank] = RankSegments(
                rank=rank,
                tmp_shrink=tmp_shrink,
                tmp_expand=tmp_expand,
                lora_a_ptr=lora_a_ptr[indices],
                lora_b_ptr=lora_b_ptr[indices],
                segment_starts=segment_starts,
                segment_ends=segment_ends,
                indices=batch_indices,
            )

        return BatchLoraWeights(
            lora_a=lora_a,
            lora_b=lora_b,
            adapter_index_configs=adapter_index_configs,
            rank_data=rank_data,
            use_sgmv=use_sgmv,
        )


def get_scaling_factor(
    lora_alpha: int,
    r: int,
    uses_rslora: bool = False,
) -> float:
    """Computes the scaling factor for the lora weights."""
    if uses_rslora:
        return lora_alpha / (r**0.5)
    return lora_alpha / r


def _convert_lora(v: AdapterWeights) -> AdapterWeights:
    if hasattr(v, "lora_weights"):
        return v.lora_weights
    return v


================================================
FILE: backends/gaudi/server/text_generation_server/adapters/weights.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/adapters/weights.py
# License:  Apache License Version 2.0, January 2004

from abc import ABC, abstractclassmethod
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Type

import torch


@dataclass
class AdapterBatchMetadata:
    # [batch_size]
    adapter_indices: torch.Tensor

    # [num_adapters]
    adapter_set: Set[int]

    # [num_segments + 1]
    adapter_segments: torch.Tensor

    # [num_segments]
    # maps from segment index to adapter index, i.e.:
    # segment_indices[s] == adapter_indices[i]
    segment_indices: List[int]


class AdapterWeights(ABC):
    @abstractclassmethod
    def get_batch_types(cls) -> List[Type["BatchAdapterWeights"]]:
        pass

    @property
    def speculative_tokens(self) -> int:
        return 0


class BatchAdapterWeights(ABC):
    @abstractclassmethod
    def has_adapter(self, adapter_index: int) -> bool:
        pass

    @abstractclassmethod
    def load(
        cls,
        adapter_weights: Dict[int, AdapterWeights],
        meta: "AdapterBatchMetadata",
        prefill: bool,
        prefill_head_indices: torch.Tensor,
    ) -> Optional["BatchAdapterWeights"]:
        pass


class LayerAdapterWeights:
    """Adapter weights that apply to a particular layer."""

    def __init__(self):
        self.adapter_weights: Dict[int, AdapterWeights] = {}

    def add_adapter(self, adapter_idx: int, weights: AdapterWeights):
        self.adapter_weights[adapter_idx] = weights

    def remove_adapter(self, adapter_idx: int):
        if adapter_idx not in self.adapter_weights:
            return
        del self.adapter_weights[adapter_idx]

    def is_empty(self) -> bool:
        return len(self.adapter_weights) == 0

    def get_data(
        self,
        meta: AdapterBatchMetadata,
        prefill: bool,
        prefill_head_indices: Optional[torch.Tensor],
    ) -> Dict[str, BatchAdapterWeights]:
        # bucket adapters by batch class
        adapter_batch_types: Dict[
            Type[BatchAdapterWeights], Dict[int, AdapterWeights]
        ] = defaultdict(dict)
        for adapter_index, adapter_weights in self.adapter_weights.items():
            for batch_type in adapter_weights.get_batch_types():
                adapter_batch_types[batch_type][adapter_index] = adapter_weights

        batch_data = {}
        for batch_type, adapter_weights in adapter_batch_types.items():
            batched_weights = batch_type.load(
                adapter_weights, meta, prefill, prefill_head_indices
            )
            if batched_weights is not None:
                batch_data = batched_weights
        return batch_data


@dataclass
class AdapterBatchData:
    meta: AdapterBatchMetadata

    # layer type -> adapter type -> batch weight data
    data: Dict[str, Dict[str, BatchAdapterWeights]]

    prefill: bool

    @staticmethod
    def from_meta(
        meta: AdapterBatchMetadata,
        weights: Dict[str, LayerAdapterWeights],
        prefill: bool,
        prefill_head_indices: Optional[torch.Tensor],
    ) -> "AdapterBatchData":
        data = {}
        for k, v in weights.items():
            if v.is_empty():
                continue
            data[k] = v.get_data(
                meta, prefill, prefill_head_indices if k == "lm_head" else None
            )
        return AdapterBatchData(meta=meta, data=data, prefill=prefill)

    def ranks(self) -> Set[int]:
        # TODO(travis): refactor to be less coupled to lora implementation
        ranks = set()
        for lora_data in self.data.values():
            if lora_data is None:
                continue

            for rank_data in lora_data.rank_data.values():
                ranks.add(rank_data.rank)

        return ranks

    def layer_names(self) -> Set[str]:
        return set(self.data.keys())

    def adapter_keys(self) -> Set[str]:
        adapter_keys = set()
        for layer_data in self.data.values():
            adapter_keys.update(layer_data.keys())
        return adapter_keys

    @property
    def max_rank(self) -> int:
        ranks = self.ranks()
        return max(ranks) if len(ranks) > 0 else 0


================================================
FILE: backends/gaudi/server/text_generation_server/cache.py
================================================
import torch

from typing import Dict, Optional, TypeVar

from text_generation_server.models.types import Batch

B = TypeVar("B", bound=Batch)


class Cache:
    def __init__(self):
        self.cache: Dict[int, B] = {}

    def pop(self, batch_id: int) -> Optional[B]:
        return self.cache.pop(batch_id, None)

    def set(self, entry: B):
        if entry is not None:
            self.cache[entry.batch_id] = entry

    def delete(self, batch_id: int):
        batch = self.pop(batch_id)
        if batch is not None:
            del batch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def clear(self):
        keys = list(self.cache.keys())
        for k in keys:
            self.delete(k)

    def __len__(self):
        return len(self.cache.keys())


================================================
FILE: backends/gaudi/server/text_generation_server/cli.py
================================================
import os
import sys
import typer

from pathlib import Path
from loguru import logger
from typing import Optional
from enum import Enum
from huggingface_hub import hf_hub_download
from text_generation_server.utils.adapter import parse_lora_adapters


app = typer.Typer()


class Quantization(str, Enum):
    gptq = "gptq"
    awq = "awq"
    fp8 = "fp8"
    compressed_tensors = "compressed-tensors"


class Dtype(str, Enum):
    float16 = "float16"
    bloat16 = "bfloat16"


class KVCacheDtype(str, Enum):
    fp8_e4m3fn = "fp8_e4m3fn"
    fp8_e5m2 = "fp8_e5m2"


@app.command()
def serve(
    model_id: str,
    revision: Optional[str] = None,
    sharded: bool = False,
    quantize: Optional[Quantization] = None,
    speculate: Optional[int] = None,
    dtype: Optional[Dtype] = None,
    kv_cache_dtype: Optional[KVCacheDtype] = None,
    trust_remote_code: bool = False,
    uds_path: Path = "/tmp/text-generation-server",
    logger_level: str = "INFO",
    json_output: bool = False,
    otlp_endpoint: Optional[str] = None,
    otlp_service_name: str = "text-generation-inference.server",
    max_input_tokens: Optional[int] = None,
):
    if sharded:
        # assert (
        #     os.getenv("RANK", None) is not None
        # ), "RANK must be set when sharded is True"
        assert (
            os.getenv("WORLD_SIZE", None) is not None
        ), "WORLD_SIZE must be set when sharded is True"
        assert (
            os.getenv("MASTER_ADDR", None) is not None
        ), "MASTER_ADDR must be set when sharded is True"
        assert (
            os.getenv("MASTER_PORT", None) is not None
        ), "MASTER_PORT must be set when sharded is True"

    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    # Import here after the logger is added to log potential import exceptions
    from text_generation_server import server
    from text_generation_server.tracing import setup_tracing

    # Setup OpenTelemetry distributed tracing
    if otlp_endpoint is not None:
        setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint)

    lora_adapters = parse_lora_adapters(os.getenv("LORA_ADAPTERS"))

    # TODO: enable lora with cuda graphs. for now disable cuda graphs if lora is enabled
    # and warn the user
    if lora_adapters:
        logger.warning("LoRA adapters enabled (experimental feature).")

        if "CUDA_GRAPHS" in os.environ:
            logger.warning(
                "LoRA adapters incompatible with CUDA Graphs. Disabling CUDA Graphs."
            )
            global CUDA_GRAPHS
            CUDA_GRAPHS = None

    # Downgrade enum into str for easier management later on
    quantize = None if quantize is None else quantize.value
    dtype = "bfloat16" if dtype is None else dtype.value
    kv_cache_dtype = None if kv_cache_dtype is None else kv_cache_dtype.value
    logger.info(f"quantize={quantize} kv_cache_dtype={kv_cache_dtype}")
    if dtype is not None and quantize not in {
        None,
        "bitsandbytes",
        "bitsandbytes-nf4",
        "bitsandbytes-fp4",
        "gptq",
        "awq",
        "fp8",
        "compressed-tensors",
    }:
        raise RuntimeError(
            "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
        )
    server.serve(
        model_id,
        lora_adapters,
        revision,
        sharded,
        quantize,
        speculate,
        dtype,
        kv_cache_dtype,
        trust_remote_code,
        uds_path,
        max_input_tokens,
    )


@app.command()
def download_weights(
    model_id: str,
    revision: Optional[str] = None,
    extension: str = ".safetensors",
    auto_convert: bool = True,
    logger_level: str = "INFO",
    json_output: bool = False,
    trust_remote_code: bool = False,
    merge_lora: bool = False,
):
    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    # Import here after the logger is added to log potential import exceptions
    from text_generation_server import utils

    # Test if files were already download
    try:
        utils.weight_files(model_id, revision, extension)
        logger.info("Files are already present on the host. " "Skipping download.")
        return
    # Local files not found
    except (utils.LocalEntryNotFoundError, FileNotFoundError, utils.EntryNotFoundError):
        pass

    is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv(
        "WEIGHTS_CACHE_OVERRIDE", None
    ) is not None

    if not is_local_model:
        # TODO: maybe reverse the default value of merge_lora?
        # currently by default we don't merge the weights with the base model
        if merge_lora:
            try:
                hf_hub_download(
                    model_id, revision=revision, filename="adapter_config.json"
                )
                utils.download_and_unload_peft(
                    model_id, revision, trust_remote_code=trust_remote_code
                )
                is_local_model = True
                utils.weight_files(model_id, revision, extension)
                return
            except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
                pass
        else:
            try:
                utils.peft.download_peft(
                    model_id, revision, trust_remote_code=trust_remote_code
                )
            except Exception:
                pass

        try:
            import json

            config = hf_hub_download(
                model_id, revision=revision, filename="config.json"
            )
            with open(config, "r") as f:
                config = json.load(f)

            base_model_id = config.get("base_model_name_or_path", None)
            if base_model_id and base_model_id != model_id:
                try:
                    logger.info(f"Downloading parent model {base_model_id}")
                    download_weights(
                        model_id=base_model_id,
                        revision="main",
                        extension=extension,
                        auto_convert=auto_convert,
                        logger_level=logger_level,
                        json_output=json_output,
                        trust_remote_code=trust_remote_code,
                    )
                except Exception:
                    pass
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass

        # Try to download weights from the hub
        try:
            filenames = utils.weight_hub_files(model_id, revision, extension)
            utils.download_weights(filenames, model_id, revision)
            # Successfully downloaded weights
            return

        # No weights found on the hub with this extension
        except utils.EntryNotFoundError as e:
            # Check if we want to automatically convert to safetensors or if we can use .bin weights instead
            if not extension == ".safetensors" or not auto_convert:
                raise e

    elif (Path(model_id) / "adapter_config.json").exists():
        # Try to load as a local PEFT model
        try:
            utils.download_and_unload_peft(
                model_id, revision, trust_remote_code=trust_remote_code
            )
            utils.weight_files(model_id, revision, extension)
            return
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass
    elif (Path(model_id) / "config.json").exists():
        # Try to load as a local Medusa model
        try:
            import json

            config = Path(model_id) / "config.json"
            with open(config, "r") as f:
                config = json.load(f)

            base_model_id = config.get("base_model_name_or_path", None)
            if base_model_id:
                try:
                    logger.info(f"Downloading parent model {base_model_id}")
                    download_weights(
                        model_id=base_model_id,
                        revision="main",
                        extension=extension,
                        auto_convert=auto_convert,
                        logger_level=logger_level,
                        json_output=json_output,
                        trust_remote_code=trust_remote_code,
                    )
                except Exception:
                    pass
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass

    # Try to see if there are local pytorch weights
    try:
        # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE
        try:
            local_pt_files = utils.weight_files(model_id, revision, ".bin")
        except Exception:
            local_pt_files = utils.weight_files(model_id, revision, ".pt")

    # No local pytorch weights
    except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
        if extension == ".safetensors":
            logger.warning(
                f"No safetensors weights found for model {model_id} at revision {revision}. "
                f"Downloading PyTorch weights."
            )

        # Try to see if there are pytorch weights on the hub
        pt_filenames = utils.weight_hub_files(model_id, revision, ".bin")
        # Download pytorch weights
        local_pt_files = utils.download_weights(pt_filenames, model_id, revision)

    if auto_convert:
        if not trust_remote_code:
            logger.warning(
                "🚨🚨BREAKING CHANGE in 2.0🚨🚨: Safetensors conversion is disabled without `--trust-remote-code` because "
                "Pickle files are unsafe and can essentially contain remote code execution!"
                "Please check for more information here: https://huggingface.co/docs/text-generation-inference/basic_tutorials/safety",
            )

        logger.warning(
            f"No safetensors weights found for model {model_id} at revision {revision}. "
            f"Converting PyTorch weights to safetensors."
        )

        # Safetensors final filenames
        local_st_files = [
            p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors"
            for p in local_pt_files
        ]
        try:
            import transformers
            import json

            if is_local_model:
                config_filename = os.path.join(model_id, "config.json")
            else:
                config_filename = hf_hub_download(
                    model_id, revision=revision, filename="config.json"
                )
            with open(config_filename, "r") as f:
                config = json.load(f)
            architecture = config["architectures"][0]

            class_ = getattr(transformers, architecture)

            # Name for this varible depends on transformers version.
            discard_names = getattr(class_, "_tied_weights_keys", [])

        except Exception:
            discard_names = []
        # Convert pytorch weights to safetensors
        utils.convert_files(local_pt_files, local_st_files, discard_names)


@app.command()
def quantize(
    model_id: str,
    output_dir: str,
    revision: Optional[str] = None,
    logger_level: str = "INFO",
    json_output: bool = False,
    trust_remote_code: bool = False,
    upload_to_model_id: Optional[str] = None,
    percdamp: float = 0.01,
    act_order: bool = False,
    groupsize: int = 128,
):
    if revision is None:
        revision = "main"
    download_weights(
        model_id=model_id,
        revision=revision,
        logger_level=logger_level,
        json_output=json_output,
    )
    from text_generation_server.layers.gptq.quantize import quantize

    quantize(
        model_id=model_id,
        bits=4,
        groupsize=groupsize,
        output_dir=output_dir,
        revision=revision,
        trust_remote_code=trust_remote_code,
        upload_to_model_id=upload_to_model_id,
        percdamp=percdamp,
        act_order=act_order,
        sym=True,
    )


if __name__ == "__main__":
    app()


================================================
FILE: backends/gaudi/server/text_generation_server/interceptor.py
================================================
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import torch
import grpc

from google.rpc import status_pb2, code_pb2
from grpc_status import rpc_status
from grpc_interceptor.server import AsyncServerInterceptor
from loguru import logger
from typing import Callable, Any
import traceback
import os


class ExceptionInterceptor(AsyncServerInterceptor):
    async def intercept(
        self,
        method: Callable,
        request_or_iterator: Any,
        context: grpc.ServicerContext,
        method_name: str,
    ) -> Any:
        try:
            response = method(request_or_iterator, context)
            return await response
        except Exception as err:
            trace = " " + traceback.format_exc() if os.environ.get("DUMP_STACK") else ""
            method_name = method_name.split("/")[-1]
            logger.exception(f"Method {method_name} encountered an error.")

            # Runtime Error cannot be recovered from
            if isinstance(err, RuntimeError):
                exit(1)

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            from .utils.debug import dbg_trace

            dbg_trace("EXCEPTION", traceback.format_exc())
            await context.abort_with_status(
                rpc_status.to_status(
                    status_pb2.Status(code=code_pb2.INTERNAL, message=str(err) + trace)
                )
            )


================================================
FILE: backends/gaudi/server/text_generation_server/layers/__init__.py
================================================
from text_generation_server.layers.tensor_parallel import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    TensorParallelEmbedding,
)
from text_generation_server.layers.linear import (
    get_linear,
    FastLinear,
)
from text_generation_server.layers.speculative import SpeculativeHead

# Just to add the `load` methods.
from text_generation_server.layers.layernorm import load_layer_norm
from text_generation_server.layers.conv import load_conv2d
from text_generation_server.layers.fp8 import Fp8Linear

from text_generation_server.layers.lora import (
    LoraLinear,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)

__all__ = [
    "get_linear",
    "FastLinear",
    "TensorParallelColumnLinear",
    "TensorParallelRowLinear",
    "TensorParallelEmbedding",
    "SpeculativeHead",
    "LoraLinear",
    "Fp8Linear",
    "TensorParallelMultiAdapterLinear",
    "TensorParallelAdapterRowLinear",
    "load_layer_norm",
    "load_conv2d",
]


================================================
FILE: backends/gaudi/server/text_generation_server/layers/attention/__init__.py
================================================
from .common import (
    Seqlen,
    HPUPagedAttentionMetadata,
    trim_attn_metadata,
    trim_seqlen_metadata,
    _async_h2d_tensor_copy,
)

from .hpu import (
    SUPPORTS_WINDOWING,
    attention,
    paged_attention,
    paged_attention_mla,
    set_block_mapping,
)


# KVCache needs `reshape_and_cache`, so ensure that it is defined already.
from .kv_cache import KVCache, get_kv_scales, KVCompressCache

__all__ = [
    "attention",
    "get_kv_scales",
    "paged_attention",
    "paged_attention_mla",
    "set_block_mapping",
    "SUPPORTS_WINDOWING",
    "KVCache",
    "KVCompressCache",
    "Seqlen",
    "HPUPagedAttentionMetadata",
    "trim_seqlen_metadata",
    "trim_attn_metadata",
    "_async_h2d_tensor_copy",
]


================================================
FILE: backends/gaudi/server/text_generation_server/layers/attention/common.py
================================================
from dataclasses import dataclass
import torch
from typing import Optional, List, Dict
import collections
import torch.nn.functional as F

_TYPE_CACHE = {}


@dataclass
class HPUPagedAttentionMetadata:
    """Metadata for PagedAttention."""

    block_list: Optional[torch.Tensor]
    block_mapping: Optional[torch.Tensor]
    block_usage: Optional[torch.Tensor]
    block_groups: Optional[torch.Tensor]
    attn_bias: Optional[torch.Tensor]
    slots_in_window_mask: Optional[torch.Tensor] = None
    block_list_in_window: Optional[torch.Tensor] = None
    block_mapping_in_window: Optional[torch.Tensor] = None
    block_usage_in_window: Optional[torch.Tensor] = None
    block_groups_in_window: Optional[torch.Tensor] = None
    attn_bias_in_window: Optional[torch.Tensor] = None


def subtuple(
    obj: object,
    typename: str,
    to_copy: List[str],
    to_override: Optional[Dict[str, object]] = None,
):
    if obj is None:
        return None
    if to_override is None:
        to_override = {}
    fields = set(to_copy) | set(to_override.keys())
    if isinstance(obj, dict):
        values = {key: obj[key] for key in fields if key in obj}
    else:
        values = {f: to_override.get(f, getattr(obj, f)) for f in fields}
    if typename not in _TYPE_CACHE:
        _TYPE_CACHE[typename] = collections.namedtuple(typename, " ".join(fields))
    return _TYPE_CACHE[typename](**values)


def trim_attn_metadata(metadata: HPUPagedAttentionMetadata) -> object:
    # NOTE(kzawora): To anyone working on this in the future:
    # Trimming metadata is required when using HPUGraphs.
    # Attention metadata is going to be hashed by PT bridge, and
    # appropriate HPUGraphs will be matched based on all inputs' hash.

    # Before you put more keys in here, make sure you know their
    # value type and make sure you know how it's going to be hashed.
    # You can find that information in input_hash function
    # in habana_frameworks/torch/hpu/graphs.py. You can also hash
    # it manually with torch.hpu.graphs.input_hash(attention_metadata)

    # If you use primitive types here - they will get hashed based
    # on their value. You *will* get lots of excessive graph captures
    # (and an OOM eventually) if you decide to put something like
    # seq_len int here.
    # If you absolutely need a scalar, put it in a tensor. Tensors
    # get hashed using their metadata, not their values:
    # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321))
    # input_hash(123) != input_hash(321)
    # input_hash("abc") != input_hash("cba")
    attention_metadata = subtuple(
        metadata,
        "TrimmedAttentionMetadata",
        [
            "block_list",
            "block_mapping",
            "block_usage",
            "block_groups",
            "attn_bias",
            "slots_in_window_mask",
            "block_list_in_window",
            "block_mapping_in_window",
            "block_usage_in_window",
            "block_groups_in_window",
            "attn_bias_in_window",
        ],
    )
    return attention_metadata


@dataclass
class Seqlen:
    input_lengths: torch.Tensor
    attn_mask: Optional[torch.Tensor] = None

    def __init__(
        self,
        input_lengths,
    ):
        self.input_lengths = input_lengths

    def clamp(self, max):
        # Flash decoding doesn't need to clamp
        return self

    def make_sliding_window_bias(
        self,
        seq_lens: List[int],
        window_size: Optional[int],
        dtype: torch.dtype,
        padded_input_len: Optional[int],
        padded_bs: Optional[int],
    ) -> List[torch.Tensor]:
        attn_biases = []
        for seq_len in seq_lens:
            if seq_len != 0:
                tensor = torch.full(
                    (1, seq_len, seq_len),
                    dtype=dtype,
                    fill_value=1,
                )
                shift = 0
                mask = torch.tril(tensor, diagonal=shift).to(dtype)  # type: ignore
                if window_size is not None:
                    mask = torch.triu(mask, diagonal=shift - window_size + 1)
                mask = F.pad(
                    mask,
                    (
                        padded_input_len - seq_len,
                        0,
                        padded_input_len - seq_len,
                        0,
                        0,
                        0,
                    ),
                    value=0,
                )
            else:
                mask = torch.full(
                    (1, padded_input_len, padded_input_len),
                    dtype=dtype,
                    fill_value=0,
                )
            attn_biases.append(mask)
        attn_biases = torch.stack(attn_biases, dim=0)
        return attn_biases.to(torch.bool)


def _async_h2d_tensor_copy(source, device="hpu"):
    if source is None:
        return None
    if source.device.type == "hpu":
        return source
    assert source.device.type == "cpu", "Source tensor is not present in host memory!"
    target = torch.empty(source.shape, dtype=source.dtype, device=device)
    target.copy_(source, non_blocking=True)
    return target


def trim_seqlen_metadata(metadata: Seqlen) -> object:
    # NOTE(kzawora): To anyone working on this in the future:
    # Trimming metadata is required when using HPUGraphs.
    # Attention metadata is going to be hashed by PT bridge, and
    # appropriate HPUGraphs will be matched based on all inputs' hash.

    # Before you put more keys in here, make sure you know their
    # value type and make sure you know how it's going to be hashed.
    # You can find that information in input_hash function
    # in habana_frameworks/torch/hpu/graphs.py. You can also hash
    # it manually with torch.hpu.graphs.input_hash(attention_metadata)

    # If you use primitive types here - they will get hashed based
    # on their value. You *will* get lots of excessive graph captures
    # (and an OOM eventually) if you decide to put something like
    # seq_len int here.
    # If you absolutely need a scalar, put it in a tensor. Tensors
    # get hashed using their metadata, not their values:
    # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321))
    # input_hash(123) != input_hash(321)
    # input_hash("abc") != input_hash("cba")
    attention_metadata = subtuple(
        metadata,
        "TrimmedSeqlen",
        [
            "input_lengths",
            "attn_mask",
        ],
    )
    return attention_metadata


================================================
FILE: backends/gaudi/server/text_generation_server/layers/attention/hpu.py
================================================
import torch
from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata
from typing import Optional
from text_generation_server.layers.attention.kv_cache import KVCache, KVScales
from vllm_hpu_extension import ops
from vllm_hpu_extension.utils import Matmul
from habana_frameworks.torch.hpex.kernels import FusedSDPA
from vllm_hpu_extension.utils import ModuleFusedSDPA
import os
from text_generation_server.models.globals import BLOCK_SIZE
import math

SUPPORTS_WINDOWING = False


class FP8Matmul(torch.nn.Module):

    def __init__(self, scale_other):
        super().__init__()
        self.scale_input = torch.tensor(1.0, dtype=torch.bfloat16, device="hpu")
        self.scale_other = scale_other

    def quant_input(self, x, scale):
        return torch.ops.hpu.cast_to_fp8_v2(
            x, scale, False, False, torch.float8_e4m3fn
        )[0]

    def matmul_fp8(
        self, x, other, out_dtype, scale_input_inv=None, scale_other_inv=None
    ):
        return torch.ops.hpu.fp8_gemm_v2(
            A=x,
            trans_A=False,
            B=other,
            trans_B=False,
            D=None,
            out_dtype=out_dtype,
            A_scale_inv=scale_input_inv,
            B_scale_inv=scale_other_inv,
            bias=None,
            accumulate=False,
        )

    def forward(self, input, other):
        qinput = self.quant_input(input, self.scale_input)
        qother = self.quant_input(other, self.scale_other)
        output = self.matmul_fp8(
            qinput,
            qother,
            out_dtype=torch.bfloat16,
            scale_input_inv=1.0 / self.scale_input,
            scale_other_inv=1.0 / self.scale_other,
        )
        return output


class FetchFromCache(torch.nn.Module):

    def __init__(self, scale_inv):
        super().__init__()
        self.scale_inv = scale_inv

    def forward(self, cache, blocks):
        if os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true":
            out = cache[: blocks.size(0)]
        else:
            out = cache.index_select(0, blocks)
        if out.dtype == torch.float8_e4m3fn:
            out = torch.ops.hpu.cast_from_fp8(out, self.scale_inv, torch.bfloat16)
        return out


def attention(
    *,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: KVCache,
    kv_scales: KVScales,
    seqlen: Seqlen,
    softmax_scale: float,
    window_size_left: int = -1,
    causal: bool = True,
    softcap: Optional[float] = None,
):
    fsdpa_op = ModuleFusedSDPA(FusedSDPA)
    bs = seqlen.input_lengths.shape[0]
    _, head_num, head_size = query.shape
    _, kv_head_num, head_size = key.shape
    query = query.view(bs, -1, head_num, head_size).transpose(1, 2)
    key = key.view(bs, -1, kv_head_num, head_size).transpose(1, 2)
    value = value.view(bs, -1, kv_head_num, head_size).transpose(1, 2)
    attn_output = fsdpa_op(
        query,
        key,
        value,
        attn_mask=seqlen.attn_mask if window_size_left != -1 else None,
        dropout_p=0.0,
        is_causal=causal if window_size_left == -1 else False,
        scale=softmax_scale,
        softmax_mode="None",
        recompute_mode=None,
        valid_sequence_lengths=seqlen.input_lengths if window_size_left == -1 else None,
        padding_side="left",
    )
    attn_output = attn_output.transpose(1, 2).squeeze(0).contiguous()
    return attn_output


def set_block_mapping(hpu_attention_meta: HPUPagedAttentionMetadata, batch_size):
    block_mapping = torch.nn.functional.one_hot(
        hpu_attention_meta.block_groups, num_classes=batch_size
    )
    dtype = hpu_attention_meta.block_usage.dtype
    device = hpu_attention_meta.block_usage.device
    mask = torch.arange(0, BLOCK_SIZE, device=device, dtype=torch.int32).unsqueeze(0)
    mask = mask >= hpu_attention_meta.block_usage.unsqueeze(-1)
    attn_bias = torch.zeros_like(mask, dtype=dtype).masked_fill_(mask, -math.inf)
    hpu_attention_meta = hpu_attention_meta._replace(
        attn_bias=attn_bias, block_mapping=block_mapping.to(dtype)
    )
    if hpu_attention_meta.block_groups_in_window is not None:
        block_mapping = torch.nn.functional.one_hot(
            hpu_attention_meta.block_groups_in_window, num_classes=batch_size
        )
        attn_bias = torch.log(hpu_attention_meta.slots_in_window_mask.float())
        hpu_attention_meta = hpu_attention_meta._replace(
            attn_bias_in_window=attn_bias,
            block_mapping_in_window=block_mapping.to(dtype),
        )
    return hpu_attention_meta


def paged_attention(
    query: torch.Tensor,
    kv_cache: KVCache,
    kv_head_mapping: torch.Tensor,
    softmax_scale: float,
    seqlen: Seqlen,
    *,
    kv_scales: KVScales,
    softcap: Optional[float] = None,
    hpu_attention_meta: HPUPagedAttentionMetadata,
    window_size_left: int = -1,
):
    batch_size, head_num, head_size = query.shape
    fp8_kv = kv_cache.dtype == torch.float8_e4m3fn
    output = ops.flat_pa(
        query=query.view(batch_size, 1, head_num * head_size),
        key_cache=kv_cache.key,
        value_cache=kv_cache.value,
        block_list=(
            hpu_attention_meta.block_list
            if window_size_left == -1
            else hpu_attention_meta.block_list_in_window
        ),
        block_mapping=(
            hpu_attention_meta.block_mapping
            if window_size_left == -1
            else hpu_attention_meta.block_mapping_in_window
        ),
        block_bias=(
            hpu_attention_meta.attn_bias
            if window_size_left == -1
            else hpu_attention_meta.attn_bias_in_window
        ),
        block_groups=(
            hpu_attention_meta.block_groups
            if window_size_left == -1
            else hpu_attention_meta.block_groups_in_window
        ),
        block_size=BLOCK_SIZE,
        scale=softmax_scale,
        matmul_qk_op=FP8Matmul(kv_scales.key_scale) if fp8_kv else Matmul(),
        matmul_av_op=FP8Matmul(kv_scales.value_scale) if fp8_kv else Matmul(),
        batch2block_matmul_op=Matmul(),
        block2batch_matmul_op=Matmul(),
        keys_fetch_func=FetchFromCache(1.0 / kv_scales.key_scale_cpu),
        values_fetch_func=FetchFromCache(1.0 / kv_scales.value_scale_cpu),
    )
    # Reshape the output tensor.
    return output.view(batch_size, head_num, head_size)


def paged_attention_mla(
    query: torch.Tensor,
    kv_cache: KVCache,
    kv_head_mapping: torch.Tensor,
    softmax_scale: float,
    seqlen: Seqlen,
    *,
    kv_scales: KVScales,
    softcap: Optional[float] = None,
    hpu_attention_meta: HPUPagedAttentionMetadata,
    kv_lora_rank: int = 0,
):
    batch_size, head_num, head_size = query.shape
    fp8_kv = kv_cache.dtype == torch.float8_e4m3fn
    output = ops.flat_pa_mla(
        query=query,
        key_cache=kv_cache.key,
        value_cache=None,
        block_list=hpu_attention_meta.block_list,
        block_mapping=hpu_attention_meta.block_mapping,
        block_bias=hpu_attention_meta.attn_bias,
        block_groups=hpu_attention_meta.block_groups,
        block_size=BLOCK_SIZE,
        scale=softmax_scale,
        matmul_qk_op=FP8Matmul(kv_scales.key_scale) if fp8_kv else Matmul(),
        matmul_av_op=FP8Matmul(kv_scales.value_scale) if fp8_kv else Matmul(),
        batch2block_matmul_op=Matmul(),
        block2batch_matmul_op=Matmul(),
        keys_fetch_func=FetchFromCache(1.0 / kv_scales.key_scale_cpu),
        values_fetch_func=None,
        kv_lora_rank=kv_lora_rank,
    )
    # Reshape the output tensor.
    return output.view(batch_size, head_num, -1)


__all__ = [
    "SUPPORTS_WINDOWING",
    "attention",
    "paged_attention",
    "paged_attention_mla",
    "set_block_mapping",
]


================================================
FILE: backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py
================================================
from typing import Tuple
from dataclasses import dataclass, field

import torch

from text_generation_server.models.globals import BLOCK_SIZE
from text_generation_server.utils.weights import Weights


@dataclass
class KVScales:
    """
    Key-value scales for FP8 KV cache.

    This data class stores key and value scales both as a GPU tensor and
    as a GPU float. This inconvenience is necessary because some functions
    (e.g. scaling kernels) take scales as a GPU tensor, whereas others
    (e.g. flashinfer) take scales as a CPU scalar.
    """

    key_scale: torch.Tensor
    value_scale: torch.Tensor
    key_scale_cpu: float = field(init=False)
    value_scale_cpu: float = field(init=False)

    def __post_init__(self):
        if self.key_scale.numel() != 1 or self.value_scale.numel() != 1:
            raise ValueError("Key and value scales must be scalar tensors.")

        self.key_scale_cpu = self.key_scale.item()
        self.value_scale_cpu = self.value_scale.item()


class KVCache:
    """
    Key-value cache for attention layers.
    """

    kv_cache: Tuple[torch.Tensor, torch.Tensor]

    def __init__(
        self,
        *,
        num_blocks: int,
        num_heads: int,
        head_size: int,
        dtype: torch.dtype,
        device: torch.device,
    ):
        """Construct the key-value cache for a layer."""
        ## TODO FP8 kv cache support
        if dtype is torch.float8_e5m2:
            raise ValueError("torch.float8_e5m2 is not supported in hpu. ")

        self.kv_cache = (
            torch.zeros(
                (num_blocks * BLOCK_SIZE, num_heads, head_size),
                dtype=dtype,
                device=device,
            ),
            torch.zeros(
                (num_blocks * BLOCK_SIZE, num_heads, head_size),
                dtype=dtype,
                device=device,
            ),
        )

    @property
    def dtype(self):
        """Get the data type of the cache."""
        return self.kv_cache[0].dtype

    @property
    def key(self):
        """Get the key cache."""

        return self.kv_cache[0]

    @property
    def value(self):
        """Get the value cache."""

        return self.kv_cache[1]

    def store(
        self,
        *,
        key: torch.Tensor,
        value: torch.Tensor,
        slots: torch.Tensor,
        kv_scales: KVScales,
    ):
        """Store the key and value at the given slots."""
        ## TODO FP8 kv cache support

        key_cache = self.kv_cache[0]
        value_cache = self.kv_cache[1]

        paged_reshape_and_cache(
            key,
            value,
            key_cache,
            value_cache,
            slots,
            kv_scales.key_scale,
            kv_scales.value_scale,
        )


class KVCompressCache(KVCache):
    """
    Key-value cache for attention layers.
    """

    kv_cache: torch.Tensor

    def __init__(
        self,
        *,
        num_blocks: int,
        head_size: int,
        dtype: torch.dtype,
        device: torch.device,
    ):
        """Construct the key-value cache for a layer."""
        ## TODO FP8 kv cache support
        if dtype is torch.float8_e5m2:
            raise ValueError("torch.float8_e5m2 is not supported in hpu. ")

        self.kv_cache = torch.zeros(
            (num_blocks * BLOCK_SIZE, 1, head_size),
            dtype=dtype,
            device=device,
        )

    @property
    def dtype(self):
        """Get the data type of the cache."""
        return self.kv_cache.dtype

    @property
    def key(self):
        """Get the key cache."""

        return self.kv_cache

    @property
    def value(self):
        """Get the value cache."""

        return self.kv_cache

    def store(
        self,
        *,
        key: torch.Tensor,
        value: torch.Tensor,
        slots: torch.Tensor,
        kv_scales: KVScales,
    ):
        """Store the key and value at the given slots."""
        ## TODO FP8 kv cache support
        if self.kv_cache.dtype == torch.float8_e4m3fn:
            key = torch.ops.hpu.cast_to_fp8_v2(
                key, kv_scales.key_scale, False, False, torch.float8_e4m3fn
            )[0]
        self.kv_cache.index_copy_(0, slots, key)


def paged_reshape_and_cache(
    key: torch.Tensor,
    value: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    slots: torch.Tensor,
    k_scale: torch.Tensor,
    v_scale: torch.Tensor,
):
    if key_cache.dtype == torch.float8_e4m3fn:
        key = torch.ops.hpu.cast_to_fp8_v2(
            key, k_scale, False, False, torch.float8_e4m3fn
        )[0]
        value = torch.ops.hpu.cast_to_fp8_v2(
            value, v_scale, False, False, torch.float8_e4m3fn
        )[0]
    key_cache.index_copy_(0, slots, key)
    value_cache.index_copy_(0, slots, value)


def get_kv_scales(weights: Weights, prefix: str) -> KVScales:
    """Load KV cache scales."""

    key_scale = torch.tensor(1.0, dtype=torch.float32, device=weights.device)
    value_scale = key_scale
    if weights.has_tensor(f"{prefix}.k_scale") and weights.has_tensor(
        f"{prefix}.v_scale"
    ):
        key_scale = weights.get_tensor(f"{prefix}.k_scale", to_dtype=False).float()
        value_scale = weights.get_tensor(f"{prefix}.v_scale", to_dtype=False).float()
    elif weights.has_tensor(f"{prefix}.kv_scale"):
        # Fall back to older more coarse-grained scale when available.
        key_scale = weights.get_tensor(f"{prefix}.kv_scale").float()
        value_scale = key_scale

    return KVScales(key_scale=key_scale, value_scale=value_scale)


================================================
FILE: backends/gaudi/server/text_generation_server/layers/awq/conversion_utils.py
================================================
import torch
from typing import List


AWQ_PACK_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]
REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]


def pack(imatrix: torch.Tensor, direction: str = "column"):
    """
    Packs a 4-bit integer matrix into a packed 32-bit integer matrix.
    Args:
        imatrix (torch.Tensor): matrix of integers
        direction (str): direction of packing, either "column" or "row"
    Returns:
        qmatrix (torch.Tensor): packed matrix of integers
    """
    shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=imatrix.device)

    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow

    if direction == "column":
        imatrix = imatrix.view(-1, imatrix.shape[1] // (32 // 4), (32 // 4))
        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, :]).sum(dim=-1)

    elif direction == "row":
        imatrix = imatrix.view(imatrix.shape[0] // (32 // 4), (32 // 4), -1)
        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, :, None]).sum(dim=1)

    qmatrix = qmatrix.to(torch.int32)

    return qmatrix


def unpack(qmatrix: torch.Tensor, direction: str = "column"):
    """
    Unpacks a 32-bit packed integer matrix into a 4-bit integer matrix.
    Args:
        qmatrix (torch.Tensor): matrix of packed integers
        direction (str): direction of unpacking, either "column" or "row"
    Returns:
        imatrix (torch.Tensor): matrix of integers
    """
    shifts = torch.arange(0, 32, 4, device=qmatrix.device)

    if direction == "column":
        imatrix = torch.bitwise_right_shift(
            qmatrix[:, :, None], shifts[None, None, :]
        ).view(qmatrix.shape[0], -1)

    elif direction == "row":
        imatrix = torch.bitwise_right_shift(
            qmatrix[:, None, :], shifts[None, :, None]
        ).view(-1, qmatrix.shape[-1])

    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow

    return imatrix


def apply_order(
    imatrix: torch.Tensor,
    direction: str = "column",
    order: List[int] = AWQ_PACK_ORDER,
):
    """
    Applies the order to a 4-bit integer matrix.
    Args:
        imatrix (torch.Tensor): matrix of integers
        direction (str): direction of applying order, either "column" or "row"
        order (List[int]): order to apply, default is AWQ_PACK_ORDER
    Returns:
        imatrix (torch.Tensor): matrix of integers
    """
    if direction == "column":
        imatrix = imatrix.view(-1, (32 // 4))[:, order].view(imatrix.shape)
    elif direction == "row":
        imatrix = imatrix.view((32 // 4), -1)[order, :].view(imatrix.shape)

    return imatrix


def fast_awq_to_gptq(qweight, qzeros):
    # awq uses column packing for both weights and zeros
    izeros = unpack(qzeros, direction="column")
    iweights = unpack(qweight, direction="column")

    # Reverse the order of the iweight and izeros tensors
    izeros = apply_order(izeros, direction="column", order=REVERSE_AWQ_PACK_ORDER)
    iweights = apply_order(iweights, direction="column", order=REVERSE_AWQ_PACK_ORDER)
    # Subtract 1 from the izeros tensor (gptq adds 1 to the zeros)
    izeros = izeros - 1
    # exllama uses row packing for weights and column packing for zeros
    qzeros = pack(izeros, direction="column")
    qweight = pack(iweights, direction="row")

    return qweight, qzeros


================================================
FILE: backends/gaudi/server/text_generation_server/layers/awq/quantize/__init__.py
================================================
from .hpu import WQLinear

__all__ = ["WQLinear"]


================================================
FILE: backends/gaudi/server/text_generation_server/layers/awq/quantize/hpu.py
================================================
from typing import Optional
import torch
import torch.nn as nn

try:
    import habana_frameworks.torch.hpu  # noqa: F401

    convert_from_uint4 = torch.ops.hpu.convert_from_uint4
except Exception as e:
    hpu_import_exception = e

    def error_raiser_hpu(*args, **kwargs):
        raise ValueError(
            f"Trying to use HPU, but could not import the HPU framework with the following error: {hpu_import_exception}"
        )

    convert_from_uint4 = error_raiser_hpu

AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]


def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int):
    shifts = torch.arange(0, 32, bits, device=qzeros.device)

    # unpacking columnwise
    iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
        torch.int8  # smallest dtype available
    )
    iweights = iweights.view(iweights.shape[0], -1)

    # unpacking columnwise
    if qzeros is not None:
        izeros = torch.bitwise_right_shift(
            qzeros[:, :, None], shifts[None, None, :]
        ).to(
            torch.int8  # smallest dtype available
        )
        izeros = izeros.view(izeros.shape[0], -1)
    else:
        izeros = qzeros

    return iweights, izeros


def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int):
    reverse_order_tensor = torch.arange(
        iweights.shape[-1],
        dtype=torch.int32,
        device=izeros.device,
    )
    reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits)
    reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER]
    reverse_order_tensor = reverse_order_tensor.view(-1)

    if izeros is not None:
        izeros = izeros[:, reverse_order_tensor]
    iweights = iweights[:, reverse_order_tensor]

    return iweights, izeros


def unpack_weight_and_zeros(qweight, qzeros, bits):
    # Unpack the qweight and qzeros tensors
    iweight, izeros = unpack_awq(qweight, qzeros, bits)
    # Reverse the order of the iweight and izeros tensors
    iweight, izeros = reverse_awq_order(iweight, izeros, bits)

    # overflow checks
    iweight = torch.bitwise_and(iweight, (2**bits) - 1)
    izeros = torch.bitwise_and(izeros, (2**bits) - 1)

    return iweight, izeros


def pack_tensor(input, bits=4):
    normal = input.to(torch.int32)
    q = torch.zeros(
        (normal.shape[0], normal.shape[1] // 32 * bits),
        dtype=torch.int32,
        device=input.device,
    )
    i = 0
    col = 0
    while col < q.shape[1]:
        for j in range(i, i + (32 // bits)):
            q[:, col] |= normal[:, j] << (bits * (j - i))
        i += 32 // bits
        col += 1
    q = q.to(torch.int32)
    return q


class WQLinear(nn.Module):
    def __init__(
        self, w_bit, group_size, qweight, qzeros, scales, bias: Optional[torch.Tensor]
    ):
        super().__init__()

        if w_bit not in [4]:
            raise NotImplementedError("Only 4-bit are supported for now.")

        self.in_features = qweight.shape[0]
        self.out_features = qweight.shape[1] * 32 // w_bit

        self.w_bit = w_bit
        self.group_size = group_size if group_size != -1 else self.in_features
        # quick sanity check (make sure aligment)
        assert self.in_features % self.group_size == 0
        assert self.out_features % (32 // self.w_bit) == 0

        self.qweight = qweight
        self.qzeros = qzeros
        self.scales = scales
        self.bias = bias
        self._preprocessing()

    def _preprocessing(self):
        device = self.qweight.device
        weight, zeros = unpack_weight_and_zeros(
            self.qweight.cpu(), self.qzeros.cpu(), self.w_bit
        )
        self.qweight = pack_tensor(weight).to(device)
        self.qzeros = pack_tensor(zeros).to(device)

    @torch.no_grad()
    def forward(self, x):
        out_shape = x.shape[:-1] + (self.out_features,)
        x = x.reshape(-1, x.shape[-1])
        weights = convert_from_uint4(self.qweight, self.scales, self.qzeros, x.dtype)
        outputs = torch.matmul(x, weights)

        outputs = outputs + self.bias if self.bias is not None else outputs
        outputs = outputs.reshape(out_shape)
        return outputs


================================================
FILE: backends/gaudi/server/text_generation_server/layers/bnb.py
================================================
from dataclasses import dataclass

import bitsandbytes as bnb
import torch
from bitsandbytes.nn import Int8Params, Params4bit
from text_generation_server.utils.weights import UnquantizedWeight


@dataclass
class BNBWeight(UnquantizedWeight):
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        return Linear8bitLt(self.weight, bias, has_fp16_weights=False, threshold=6.0)


class Linear8bitLt(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
        has_fp16_weights=True,
        memory_efficient_backward=False,
        threshold=0.0,
        index=None,
    ):
        super().__init__()
        assert (
            not memory_efficient_backward
        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
        self.state = bnb.MatmulLtState()
        self.index = index

        # Necessary for stacked layers
        self.state.threshold = threshold
        self.state.has_fp16_weights = has_fp16_weights
        self.state.memory_efficient_backward = memory_efficient_backward
        if threshold > 0.0 and not has_fp16_weights:
            self.state.use_pool = True

        self.weight = Int8Params(
            weight.data,
            has_fp16_weights=has_fp16_weights,
            requires_grad=has_fp16_weights,
        )
        self.weight.cuda(weight.device)
        self.bias = bias

    def init_8bit_state(self):
        self.state.CB = self.weight.CB
        self.state.SCB = self.weight.SCB
        self.weight.CB = None
        self.weight.SCB = None

    def forward(self, x: torch.Tensor):
        self.state.is_training = self.training
        if self.weight.CB is not None:
            self.init_8bit_state()

        # weights are cast automatically as Int8Params, but the bias has to be cast manually
        if self.bias is not None and self.bias.dtype != x.dtype:
            self.bias.data = self.bias.data.to(x.dtype)

        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)

        if not self.state.has_fp16_weights:
            if self.state.CB is not None and self.state.CxB is not None:
                # we converted 8-bit row major to turing/ampere format in the first inference pass
                # we no longer need the row-major weight
                del self.state.CB
                self.weight.data = self.state.CxB
        return out


@dataclass
class BNBFP4Weight(UnquantizedWeight):
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        return Linear4bit(self.weight, bias, quant_type="fp4")


@dataclass
class BNBNF4Weight(UnquantizedWeight):
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        return Linear4bit(self.weight, bias, quant_type="nf4")


class Linear4bit(torch.nn.Module):
    def __init__(self, weight, bias, quant_type):
        super().__init__()
        self.weight = Params4bit(
            weight.data,
            requires_grad=False,
            compress_statistics=True,
            quant_type=quant_type,
        )
        self.compute_dtype = None
        self.weight.cuda(weight.device)
        self.bias = bias

    def forward(self, x: torch.Tensor):
        # weights are cast automatically as Int8Params, but the bias has to be cast manually
        if self.bias is not None and self.bias.dtype != x.dtype:
            self.bias.data = self.bias.data.to(x.dtype)

        if getattr(self.weight, "quant_state", None) is None:
            print(
                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
            )
        inp_dtype = x.dtype
        if self.compute_dtype is not None:
            x = x.to(self.compute_dtype)

        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
        out = bnb.matmul_4bit(
            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
        )

        out = out.to(inp_dtype)

        return out


================================================
FILE: backends/gaudi/server/text_generation_server/layers/compressed_tensors/__init__.py
================================================
from .loader import CompressedTensorsLoader

__all__ = ["CompressedTensorsLoader"]


================================================
FILE: backends/gaudi/server/text_generation_server/layers/compressed_tensors/loader.py
================================================
from typing import Any, Dict, List, Union

from compressed_tensors import QuantizationConfig, QuantizationStatus
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import (
    QuantizationScheme,
    QuantizationType,
    find_name_or_class_matches,
)
from loguru import logger
from pydantic import ValidationError
from torch import nn

from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    UnquantizedWeight,
    Weights,
    WeightsLoader,
)

# compressed-tensors can match modules as quantization targets. However,
# they need to be objects rather than classes or class names. Since we
# need to match `Linear` targets, make an instance that can be re-used.
_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)


class CompressedTensorsLoader(WeightsLoader):
    """Loader for checkpoints stored in the compressed-tensors format."""

    def __init__(self, config: Dict[str, Any]):
        quantization_config_raw = config.get("quantization_config")
        if quantization_config_raw is None:
            # `compression_config` was renamed to `quantization_config`; support
            # retained for backward compatibility.
            quantization_config_raw = config.get("compression_config")
        if quantization_config_raw is None:
            raise ValueError(
                "Checkpoint does not have compressed-tensors configuration"
            )

        try:
            quantization_config = QuantizationConfig.model_validate(
                quantization_config_raw
            )
        except ValidationError as e:
            raise ValueError("Cannot parse compressed-tensors configuration") from e

        if quantization_config.quantization_status not in (
            QuantizationStatus.COMPRESSED,
            QuantizationStatus.FROZEN,
        ):
            raise ValueError(
                f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
            )

        self.ignore = (
            quantization_config.ignore if quantization_config.ignore is not None else []
        )
        self.loaders = self._get_target_loaders(quantization_config)

        for target, loader in self.loaders.items():
            log_once(
                logger.info,
                f"Using {loader} for compressed-tensors target '{target}'",
            )

    def get_weights(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights(weights, prefix)

    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_col_packed(weights, prefix, block_sizes)

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        loader = self._lookup_loader(prefixes[0])
        return loader.get_multi_weights_col(weights, prefixes, dim)

    def get_multi_weights(self, weights: Weights, prefixes: List[str], dim: int):
        loader = self._lookup_loader(prefixes[0])
        return loader.get_multi_weights(weights, prefixes, dim)

    def get_weights_row(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_row(weights, prefix)

    def _get_target_loaders(
        self, quantization_config: QuantizationConfig
    ) -> Dict[str, WeightsLoader]:
        """
        A compressed-tensors checkpoint can use different quantizations
        for different targets. This method returns a dictionary with a
        loader per target.
        """

        loaders: Dict[str, WeightsLoader] = {}

        format = quantization_config.format

        for group_name, group in quantization_config.config_groups.items():
            # The group configuration can be a string, but does that ever
            # happen in a serialized quantization config?
            assert isinstance(group, QuantizationScheme)

            loader = self._create_loader_for_group(format, group_name, group)

            # A quantized parameter group can have multiple targets, add the
            # loader for all the targets.
            for target in group.targets:
                if target in loaders:
                    raise ValueError(
                        f"Target '{target} has multiple configured loaders'"
                    )
                loaders[target] = loader

        return loaders

    def _create_loader_for_group(
        self, format: str, group_name: str, group: QuantizationScheme
    ) -> WeightsLoader:
        """
        Find and create a loader for the group with the given quantization
        scheme.
        """
        # NOTE: we ignore group.output_activations because we don't support
        #       output quantization yet.

        input_activations = group.input_activations
        weights = group.weights
        if (
            format
            in {
                CompressionFormat.float_quantized.value,
                CompressionFormat.naive_quantized.value,
            }
            and weights is not None
            and weights.type == QuantizationType.FLOAT
            and weights.num_bits == 8
        ):
            # FP W8A8 or W8A16.
            return W8ANFpLoader(input_activations=input_activations, weights=weights)
        else:
            raise ValueError(
                f"Group '{group_name}' has unsupported compressed-tensors configurtion"
            )

    def _lookup_loader(self, prefix: str) -> WeightsLoader:
        """
        Look up the loader to use for a given parameter name (prefix).
        """

        if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
            return DefaultWeightsLoader(UnquantizedWeight)

        # We currently only handle linear layers, so unconditionally pass
        # a `Linear` instance.
        targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
        if len(targets) == 0:
            raise ValueError(
                f"Cannot find compressed-tensors target for prefix: {prefix}"
            )
        return self.loaders[targets[0]]


================================================
FILE: backends/gaudi/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
================================================
from typing import List, Optional, Union

import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationType

from text_generation_server.layers.fp8 import (
    Fp8Weight,
    _load_scalar_or_matrix_scale,
    requantize_with_max_scale,
)
from text_generation_server.utils.weights import Weights, WeightsLoader


class W8ANFpLoader(WeightsLoader):
    """
    Loader for W8A8/W8A16 FP compressed-tensors parameters.
    """

    def __init__(
        self,
        *,
        input_activations: Optional[QuantizationArgs],
        weights: QuantizationArgs,
    ):
        assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8

        # We ignore the `strategy` option which sets the scales to be
        # per-tensor, per-channel or per-token. What scales are supported
        # is dependent on the kernels used (e.g. cutlass can do tokenwise,
        # Torch cannot, and FP8-Marlin does not quantize inputs at all).
        # So, instead we try to use the best-possible configuration.

        self.load_weight_scale = not weights.dynamic
        self.load_input_scale = (
            input_activations is not None and not input_activations.dynamic
        )
        self.force_w8a16 = (
            input_activations is not None and input_activations.num_bits == 16
        )

    def __str__(self) -> str:
        def scale_to_str(scale):
            return "static" if scale else "dynamic"

        quantization_type = f"W8A{16 if self.force_w8a16 else 8}"

        return f"{self.__class__.__name__} ({quantization_type}, weight: {scale_to_str(self.load_weight_scale)}, input: {scale_to_str(self.load_input_scale)})"

    def get_weights(self, weights: "Weights", prefix: str):
        w = weights.get_tensor(f"{prefix}.weight")

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = (
                weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
                .reshape(-1)
                .expand(w.shape[0])
            )
            logical_widths = [w.shape[0]]
            w, weight_scale = requantize_with_max_scale(
                w,
                weight_scale.unsqueeze(-1).to(weights.device),
                logical_widths,
                weights.dtype,
            )

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(
                f"{prefix}.input_scale", to_dtype=False
            ).reshape(-1)

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        w = weights.get_packed_sharded(
            f"{prefix}.weight", dim=0, block_sizes=block_sizes
        )

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
            if weight_scale.numel() > 1:
                weight_scale = weights.get_packed_sharded(
                    f"{prefix}.weight_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            weight_scale = weight_scale.reshape(-1).expand(w.shape[0])
            logical_widths = [w.shape[0]]
            w, weight_scale = requantize_with_max_scale(
                w,
                weight_scale.unsqueeze(-1).to(weights.device),
                logical_widths,
                weights.dtype,
            )

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
            if input_scale.numel() > 1:
                input_scale = weights.get_packed_sharded(
                    f"{prefix}.input_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            input_scale = input_scale.reshape(-1).max()

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
        w = [
            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
        ]
        shapes = [x.shape for x in w]

        # Concat then send to the device
        w = torch.cat(w, dim=dim).to(weights.device)

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
                for p, shape in zip(prefixes, shapes)
            ]
            weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)
            logical_widths = [x[0] for x in shapes]
            w, weight_scale = requantize_with_max_scale(
                w,
                weight_scale.unsqueeze(-1).to(weights.device),
                logical_widths,
                weights.dtype,
            )

        input_scale = None
        if self.load_input_scale:
            input_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
                for p, shape in zip(prefixes, shapes)
                if weights.has_tensor(f"{p}.input_scale")
            ]
            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
            input_scale = (
                torch.cat(input_scale, dim=0).reshape(-1).max()
                if len(input_scale) != 0
                else None
            )

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_multi_weights(self, weights: "Weights", prefixes: List[str], dim: int):
        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
        w = [weights.get_tensor(f"{p}.weight", to_device=False) for p in prefixes]
        shapes = [x.shape for x in w]

        # Concat then send to the device
        w = torch.cat(w, dim=dim).to(weights.device)

        weight_scale = None

        if self.load_weight_scale:
            weight_scale = [
                weights.get_tensor(f"{p}.weight_scale", to_dtype=False)
                .reshape(-1)
                .expand(shape[0])
                for p, shape in zip(prefixes, shapes)
            ]
            weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)
            logical_widths = [x[0] for x in shapes]
            w, weight_scale = requantize_with_max_scale(
                w,
                weight_scale.unsqueeze(-1).to(weights.device),
                logical_widths,
                weights.dtype,
            )

        input_scale = None
        if self.load_input_scale:
            input_scale = [
                weights.get_tensor(f"{p}.input_scale", to_dtype=False)
                .reshape(-1)
                .expand(shape[0])
                for p, shape in zip(prefixes, shapes)
                if weights.has_tensor(f"{p}.input_scale")
            ]
            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
            input_scale = (
                torch.cat(input_scale, dim=0).reshape(-1).max()
                if len(input_scale) != 0
                else None
            )

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_weights_row(self, weights: "Weights", prefix: str):
        w = weights.get_sharded(f"{prefix}.weight", dim=1)
        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
            weight_scale = weight_scale.reshape(-1).expand(w.shape[0])
            logical_widths = [w.shape[0]]
            w, weight_scale = requantize_with_max_scale(
                w,
                weight_scale.unsqueeze(-1).to(weights.device),
                logical_widths,
                weights.dtype,
            )

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(
                f"{prefix}.input_scale", to_dtype=False
            ).reshape(-1)

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )


================================================
FILE: backends/gaudi/server/text_generation_server/layers/conv.py
================================================
from accelerate import init_empty_weights
import torch


@classmethod
def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
    weight = weights.get_tensor(f"{prefix}.weight")
    bias = weights.get_tensor(f"{prefix}.bias")
    with init_empty_weights():
        conv2d = cls(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
        )

    conv2d.weight = torch.nn.Parameter(weight)
    conv2d.bias = torch.nn.Parameter(bias)
    return conv2d


@classmethod
def load_conv2d_no_bias(
    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
):
    weight = weights.get_tensor(f"{prefix}.weight")
    with init_empty_weights():
        conv2d = cls(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
        )

    conv2d.weight = torch.nn.Parameter(weight)
    conv2d.bias = None
    return conv2d


torch.nn.Conv2d.load = load_conv2d
torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias


================================================
FILE: backends/gaudi/server/text_generation_server/layers/exl2.py
================================================
from dataclasses import dataclass
from typing import List, Union

import torch
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader


@dataclass
class Exl2Weight(Weight):
    """
    Exllama2 exl2 quantized weights.
    """

    q_weight: torch.Tensor
    q_scale: torch.Tensor
    q_invperm: torch.Tensor
    q_scale_max: torch.Tensor
    q_groups: torch.Tensor

    def __post_init__(self):
        self.q_scale_max /= 256
        self.q_invperm = self.q_invperm.short()

    @property
    def device(self) -> torch.device:
        return self.q_weight.device

    def get_linear(self, bias: torch.Tensor):
        from text_generation_server.layers.gptq import ExllamaQuantLinear

        return ExllamaQuantLinear(self, bias)


class Exl2WeightsLoader(WeightsLoader):
    """Loader for exl2-quantized weights."""

    def get_weights(self, weights: "Weights", prefix: str):
        """
        Get weights at the given prefix and apply without tensor paralllism.
        """
        try:
            q_weight = weights.get_tensor(f"{prefix}.q_weight")
        except RuntimeError:
            raise RuntimeError(
                "Cannot load `exl2`-quantized weight, make sure the model is already quantized."
            )

        q_scale = weights.get_tensor(f"{prefix}.q_scale")
        q_invperm = weights.get_tensor(f"{prefix}.q_invperm")
        q_scale_max = weights.get_tensor(f"{prefix}.q_scale_max")
        q_groups = weights.get_tensor(f"{prefix}.q_groups")

        return Exl2Weight(
            q_weight=q_weight,
            q_scale=q_scale,
            q_invperm=q_invperm,
            q_scale_max=q_scale_max,
            q_groups=q_groups,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        raise RuntimeError("Column-packed weights are not supported for exl")

    def get_weights_col(self, weights: Weights, prefix: str):
        # Sharding is not yet supported, so we return the weights as-is.
        return self.get_weights(weights, prefix)

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        raise ValueError("get_multi_weights_col is not supported for exl2")

    def get_weights_row(self, weights: Weights, prefix: str):
        # Sharding is not yet supported, so we return the weights as-is.
        return self.get_weights(weights, prefix)


================================================
FILE: backends/gaudi/server/text_generation_server/layers/fp8.py
================================================
from dataclasses import dataclass
from typing import Optional, Tuple, Type, Union, List

import torch

from text_generation_server.utils.weights import (
    Weight,
    WeightsLoader,
    UnquantizedWeight,
    Weights,
)

from vllm_hpu_extension.ops import scaled_fp8_quant
from vllm_hpu_extension.scales import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2

quant_dtype: torch.dtype = torch.float8_e4m3fn
FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
if is_hpu_gaudi2():
    FP8_MAX = torch.finfo(torch.float8_e4m3fnuz).max


def pad_weight(weight, block_size):
    """Pads a matrix to make its dimensions multiples of block_size."""
    M, N = weight.shape[-2:]
    block_size_m, block_size_n = block_size
    pad_M = (block_size_m - M % block_size_m) % block_size_m
    pad_N = (block_size_n - N % block_size_n) % block_size_n

    if pad_M == 0 and pad_N == 0:
        return weight, M, N  # No padding needed
    padded_weight = torch.nn.functional.pad(
        weight, (0, pad_N, 0, pad_M), mode="constant", value=0
    )
    return padded_weight, M, N  # Return original dimensions for unpadding


def unpad_weight(weight, original_M, original_N, keep_first_dim=False):
    """Removes padding from the matrix to restore its original shape."""
    if (weight.shape[-2] == original_M) and (weight.shape[-1] == original_N):
        return weight
    if keep_first_dim:
        return weight[:, :original_M, :original_N]
    else:
        return weight[:original_M, :original_N]


def pad_block_fp8_weight_naive(weight, weight_scale, block_size):

    assert len(block_size) == 2

    block_size_m, block_size_n = block_size
    weight_scale_m, weight_scale_n = weight_scale.shape[-2:]

    weight, orig_M, orig_N = pad_weight(weight, block_size)
    M, N = weight.shape[-2:]

    assert weight_scale_m == M // block_size_m
    assert weight_scale_n == N // block_size_n

    return weight, orig_M, orig_N


def dynamic_quant(data, single_scale=False):
    if single_scale:
        scale = ((torch.abs(data)).max() + 1e-8) / FP8_MAX
    else:
        scale = ((torch.abs(data)).max(dim=-1).values + 1e-8) / FP8_MAX
        scale = scale.unsqueeze(-1)
    data_fp8 = torch.ops.hpu.cast_to_fp8_v2(
        data, 1.0 / scale, False, False, torch.float8_e4m3fn
    )[0]
    return data_fp8, scale.float()


def dequant_block_fp8_weight_naive(
    weight,
    weight_scale,
    block_size,
    dtype=torch.bfloat16,
    original_M=None,
    original_N=None,
    do_unpad=False,
):
    if weight_scale is None:
        return weight
    assert len(block_size) == 2

    weight_shape_len = len(weight.shape)

    block_size_m, block_size_n = block_size

    # mul scale
    if weight_shape_len == 2:
        weight_scale_m, weight_scale_n = weight_scale.shape
        weight_scale = weight_scale.view(weight_scale_m, 1, weight_scale_n, 1)
        weight = weight.view(weight_scale_m, block_size_m, weight_scale_n, block_size_n)
        if is_hpu_gaudi2():
            fake_weight = weight.cpu().to(dtype).to(weight.device)
            dequant_weight = fake_weight * weight_scale.to(dtype)
        else:
            dequant_weight = weight.to(dtype) * weight_scale.to(dtype)
        dequant_weight = dequant_weight.view(
            weight_scale_m * block_size_m, weight_scale_n * block_size_n
        )
        keep_first_dim = False
    elif weight_shape_len == 3:
        fd, weight_scale_m, weight_scale_n = weight_scale.shape
        weight_scale = weight_scale.view(fd, weight_scale_m, 1, weight_scale_n, 1)
        weight = weight.view(
            fd, weight_scale_m, block_size_m, weight_scale_n, block_size_n
        )
        if is_hpu_gaudi2():
            fake_weight = weight.cpu().to(dtype).to(weight.device)
            dequant_weight = fake_weight * weight_scale.to(dtype)
        else:
            dequant_weight = weight.to(dtype) * weight_scale.to(dtype)
        dequant_weight = dequant_weight.view(
            fd, weight_scale_m * block_size_m, weight_scale_n * block_size_n
        )
        keep_first_dim = True
    else:
        raise ValueError("Only support original weight shape is either 2 or 3")

    if do_unpad:
        dequant_weight = unpad_weight(
            dequant_weight, original_M, original_N, keep_first_dim=keep_first_dim
        )

    return dequant_weight


def apply_block_fp8_linear_hpu_dynamic(
    input: torch.Tensor,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
    bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    # View input as 2D matrix for fp8 methods
    input_2d = input.view(-1, input.shape[-1])
    output_shape = [*input.shape[:-1], weight.shape[0]]

    x_fp8, x_scale = dynamic_quant(input_2d)

    output = torch.ops.hpu.fp8_gemm_v2(
        x_fp8,
        False,
        weight,
        True,
        None,
        torch.bfloat16,
        x_scale,
        weight_scale,
        None,
        False,
    )
    if bias is not None:
        output = output + bias
    return output.to(dtype=input.dtype).view(*output_shape)


def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
    """
    Return an FP8 linear `Module` that is compatible with the current system.
    """
    # On other systems let Torch decide if the hardware supports FP8.
    return Fp8Linear


def normalize_e4m3fn_to_native_float8(
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    return weight, weight_scale, input_scale


def per_tensor_dequantize(
    tensor: torch.Tensor,
    inv_scale: Union[float, torch.Tensor],
    dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    device = tensor.device
    dtype = torch.bfloat16
    if is_hpu_gaudi2():
        # dequant on cpu to avoid nan on gaudi2
        tensor = tensor.to("cpu")

    fake_qweight = tensor.to(dtype).to(device)
    dq_weight = fake_qweight * inv_scale
    return dq_weight


def requantize_with_max_scale(
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    logical_widths: int,
    dtype: torch.dtype,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Max scale to be used for requanitzation.
    max_w_scale = weight_scale.max()

    if is_hpu_gaudi2():
        max_w_scale = max_w_scale * get_hpu_gaudi2_scale_factor()

    start = 0
    for idx, logical_width in enumerate(logical_widths):
        end = start + logical_width
        weight_dq = per_tensor_dequantize(
            weight[start:end, :], weight_scale[start:end, :], dtype
        )
        weight[start:end, :], max_w_scale_normalized = fp8_quantize(
            weight_dq, max_w_scale
        )
        start = end

    return weight, max_w_scale_normalized


def fp8_quantize(
    weight: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    scale_upper_bound: Optional[torch.Tensor] = None,
    qdtype: torch.dtype = torch.float8_e4m3fn,
    scalar: bool = False,
):
    """
    This function returns a reciprocal of the scale, so that a tensor can be unscaled
    by multiplying it with the returned scale. If a scale is given through the `scale`
    argument, it must also be a reciprocal (so that scales from an FP8 checkpoint can
    be used without modification).
    """
    shape = weight.shape
    qweight, scale = scaled_fp8_quant(
        weight.reshape(-1, shape[-1]),
        scale=scale,
        scale_ub=scale_upper_bound,
        # TODO: don't do this when we have to use the Torch kernel.
        use_per_token_if_dynamic=not scalar,
    )

    return qweight.reshape(shape), scale


class HybridFP8UnquantLoader(WeightsLoader):
    """Weight loader that loads FP8 and unquantized Torch tensors."""

    def __init__(
        self,
        activation_scale_ub: Optional[float],
        to_fp8: bool,
        weight_block_size: Optional[List[int]] = None,
    ):
        self.activation_scale_ub = activation_scale_ub
        self.to_fp8 = to_fp8
        self.weight_block_size = weight_block_size

    def get_weights(self, weights: "Weights", prefix: str):
        w = weights.get_tensor(f"{prefix}.weight")

        if w.dtype == torch.float8_e4m3fn:
            if self.weight_block_size is not None:
                scale = weights.get_tensor(f"{prefix}.weight_scale_inv")
                return Fp8Weight(
                    weight=w,
                    weight_scale=scale,
                    activation_scale_ub=self.activation_scale_ub,
                    dtype=weights.dtype,
                    weight_block_size=self.weight_block_size,
                )
            # FP8 branch
            scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
            scale = scale.reshape(-1).expand(w.shape[0])
            logical_widths = [w.shape[0]]
            w, scale = requantize_with_max_scale(
                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
            )

            input_scale = None
            if weights.has_tensor(f"{prefix}.input_scale"):
                input_scale = (
                    weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
                    .reshape(-1)
                    .max()
                )

            return Fp8Weight(
                weight=w,
                weight_scale=scale,
                input_scale=input_scale,
                activation_scale_ub=self.activation_scale_ub,
                dtype=weights.dtype,
            )
        if self.to_fp8:
            return Fp8Weight(weight=w, dtype=weights.dtype)

        return UnquantizedWeight(w)

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        w = weights.get_packed_sharded(
            f"{prefix}.weight", dim=0, block_sizes=block_sizes
        )

        if w.dtype == torch.float8_e4m3fn:
            # FP8 branch
            scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)

            if scale.numel() > 1:
                scale = weights.get_packed_sharded(
                    f"{prefix}.weight_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            scale = scale.reshape(-1).expand(w.shape[0])
            logical_widths = [w.shape[0]]
            w, scale = requantize_with_max_scale(
                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
            )

            input_scale = None
            if weights.has_tensor(f"{prefix}.input_scale"):
                input_scale = weights.get_tensor(
                    f"{prefix}.input_scale", to_dtype=False
                )
                if input_scale.numel() > 1:
                    input_scale = weights.get_packed_sharded(
                        f"{prefix}.input_scale",
                        dim=0,
                        block_sizes=block_sizes,
                        to_dtype=False,
                    )
                input_scale = input_scale.reshape(-1).max()

            return Fp8Weight(
                weight=w,
                weight_scale=scale,
                input_scale=input_scale,
                activation_scale_ub=self.activation_scale_ub,
                dtype=weights.dtype,
            )
        if self.to_fp8:
            return Fp8Weight(weight=w, dtype=weights.dtype)

        return UnquantizedWeight(w)

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
        w = [
            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
        ]
        shapes = [x.shape for x in w]

        # Concat then send to the device
        w = torch.cat(w, dim=dim).to(weights.device)

        # FP8 branch
        if w.dtype == torch.float8_e4m3fn:
            if self.weight_block_size is not None:
                scale = [
                    weights.get_sharded(f"{p}.weight_scale_inv", dim=0, to_device=False)
                    for p in prefixes
                ]
                scale = torch.cat(scale, dim=dim)
                scale = scale.to(weights.device)
                return Fp8Weight(
                    weight=w,
                    weight_scale=scale,
                    activation_scale_ub=self.activation_scale_ub,
                    dtype=weights.dtype,
                    weight_block_size=self.weight_block_size,
                )

            scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
                for p, shape in zip(prefixes, shapes)
            ]
            scale = torch.cat(scale, dim=0).reshape(-1)

            logical_widths = [x[0] for x in shapes]
            w, scale = requantize_with_max_scale(
                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
            )

            input_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
                for p, shape in zip(prefixes, shapes)
                if weights.has_tensor(f"{p}.input_scale")
            ]
            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
            input_scale = (
                torch.cat(input_scale, dim=0).reshape(-1).max()
                if len(input_scale) != 0
                else None
            )

            return Fp8Weight(
                weight=w,
                weight_scale=scale,
                input_scale=input_scale,
                activation_scale_ub=self.activation_scale_ub,
                dtype=weights.dtype,
            )
        if self.to_fp8:
            return Fp8Weight(weight=w, dtype=weights.dtype)

        return UnquantizedWeight(w)

    def get_multi_weights(self, weights: "Weights", prefixes: List[str], dim: int):
        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
        w = [weights.get_tensor(f"{p}.weight", to_device=False) for p in prefixes]
        shapes = [x.shape for x in w]

        # Concat then send to the device
        w = torch.cat(w, dim=dim).to(weights.device)

        # FP8 branch
        if w.dtype == torch.float8_e4m3fn:
            if self.weight_block_size is not None:
                scale = [
                    weights.get_tensor(f"{p}.weight_scale_inv", to_device=False)
                    for p in prefixes
                ]
                scale = torch.cat(scale, dim=dim)
                scale = scale.to(weights.device)
                return Fp8Weight(
                    weight=w,
                    weight_scale=scale,
                    activation_scale_ub=self.activation_scale_ub,
                    dtype=weights.dtype,
                    weight_block_size=self.weight_block_size,
                )

            scale = [
                weights.get_tensor(f"{p}.weight_scale", to_dtype=False)
                .reshape(-1)
                .expand(shape[0])
                for p, shape in zip(prefixes, shapes)
            ]
            scale = torch.cat(scale, dim=0).reshape(-1)

            logical_widths = [x[0] for x in shapes]
            w, scale = requantize_with_max_scale(
                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
            )

            input_scale = [
                weights.get_tensor(f"{p}.input_scale", to_dtype=False).reshape(-1)
                for p in prefixes
                if weights.has_tensor(f"{p}.input_scale")
            ]
            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
            input_scale = (
                torch.cat(input_scale, dim=0).reshape(-1).max()
                if len(input_scale) != 0
                else None
            )

            return Fp8Weight(
                weight=w,
                weight_scale=scale,
                input_scale=input_scale,
                activation_scale_ub=self.activation_scale_ub,
                dtype=weights.dtype,
            )
        if self.to_fp8:
            return Fp8Weight(weight=w, dtype=weights.dtype)

        return UnquantizedWeight(w)

    def get_weights_row(self, weights: "Weights", prefix: str):
        w = weights.get_sharded(f"{prefix}.weight", dim=1)
        # FP8 branch
        if w.dtype == torch.float8_e4m3fn:
            if self.weight_block_size is not None:
                # XXX: Yes the weights is named scale_inv, but corresponds to scale it seems.
                scale = weights.get_sharded(f"{prefix}.weight_scale_inv", dim=1)

                return Fp8Weight(
                    weight=w,
                    weight_scale=scale,
                    activation_scale_ub=self.activation_scale_ub,
                    dtype=weights.dtype,
                    weight_block_size=self.weight_block_size,
                )

            scale = (
                weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
                .reshape(-1)
                .expand(w.shape[0])
            )
            logical_widths = [w.shape[0]]
            w, scale = requantize_with_max_scale(
                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
            )

            input_scale = None
            if weights.has_tensor(f"{prefix}.input_scale"):
                input_scale = (
                    weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
                    .reshape(-1)
                    .max()
                )

            return Fp8Weight(
                weight=w,
                weight_scale=scale,
                input_scale=input_scale,
                activation_scale_ub=self.activation_scale_ub,
                dtype=weights.dtype,
            )
        if self.to_fp8:
            return Fp8Weight(weight=w, dtype=weights.dtype)

        return UnquantizedWeight(w)


@dataclass
class Fp8Weight(Weight):
    weight: torch.Tensor
    dtype: torch.dtype
    weight_scale: Optional[torch.Tensor] = None
    input_scale: Optional[torch.Tensor] = None
    activation_scale_ub: Optional[float] = None
    force_w8a16: bool = False
    weight_block_size: Optional[List[int]] = None

    def get_linear(self, bias: torch.Tensor):
        if self.weight_scale is None:
            return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant(
                self.weight, bias, self.dtype
            )
        # This is not checked by the fbgemm kernels, but they require contiguous
        # memory. Can be non-contiguous when we e.g. expand from scalars.
        self.weight_scale = self.weight_scale.contiguous()
        return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8(
            weight=self.weight,
            scale=self.weight_scale,
            dtype=self.dtype,
            bias=bias,
            input_scale=self.input_scale,
            scale_upper_bound=self.activation_scale_ub,
            weight_block_size=self.weight_block_size,
        )


class Fp8Linear(torch.nn.Module):
    _device_identity_cache = {}

    def __init__(
        self,
        qweight: torch.Tensor,
        scale: torch.Tensor,
        dtype: torch.dtype,
        bias: Optional[torch.Tensor] = None,
        input_scale: Optional[torch.Tensor] = None,
        scale_upper_bound: Optional[float] = None,
        weight_block_size: Optional[List[int]] = None,
    ) -> None:
        super().__init__()

        self.dtype = dtype
        self.qweight = qweight
        self.scale = scale.float()
        self.input_scale = input_scale.float() if input_scale is not None else None
        self.weight_block_size = weight_block_size
        self.scale_upper_bound = scale_upper_bound

        self.bias = bias if bias is not None else None

    @classmethod
    def from_unquant(cls, weight, bias, dtype):
        qweight, scale = fp8_quantize(weight, scalar=True)
        return cls(
            qweight=qweight,
            scale=scale,
            dtype=dtype,
            bias=bias,
            input_scale=None,
            scale_upper_bound=None,
        )

    @classmethod
    def from_fp8(
        cls,
        weight: torch.Tensor,
        scale: torch.Tensor,
        dtype: torch.dtype,
        bias: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> "Fp8Linear":
        input_scale = kwargs.get("input_scale", None)
        scale_upper_bound = kwargs.get("scale_upper_bound", None)
        weight_block_size = kwargs.get("weight_block_size", None)

        if weight_block_size is not None:
            weight, orig_M, orig_N = pad_block_fp8_weight_naive(
                weight, scale, weight_block_size
            )
            weight, scale = dynamic_quant(
                dequant_block_fp8_weight_naive(
                    weight,
                    scale,
                    weight_block_size,
                    original_M=orig_M,
                    original_N=orig_N,
                    do_unpad=True,
                )
            )
            scale = scale.squeeze(-1)

        return cls(
            qweight=weight,
            scale=scale,
            input_scale=input_scale,
            scale_upper_bound=scale_upper_bound,
            bias=bias,
            dtype=dtype,
            weight_block_size=weight_block_size,
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if self.weight_block_size is not None or self.input_scale is None:
            return apply_block_fp8_linear_hpu_dynamic(
                input, self.qweight, self.scale, self.input_scale, self.bias
            )

        x_fp8 = torch.ops.hpu.cast_to_fp8_v2(
            input, 1.0 / self.input_scale, False, False, torch.float8_e4m3fn
        )[0]
        return torch.ops.hpu.fp8_gemm_v2(
            A=x_fp8,
            trans_A=False,
            B=self.qweight,
            trans_B=True,
            D=None,
            out_dtype=input.dtype,
            A_scale_inv=self.input_scale,
            B_scale_inv=self.scale,
            bias=self.bias,
            accumulate=False,
        )


def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: torch.Size):
    scale = weights.get_tensor(prefix, to_dtype=False)

    if scale.numel() > 1:
        scale = weights.get_sharded(prefix, dim=0, to_dtype=False)
    return scale.reshape(-1).expand(shape[0])


================================================
FILE: backends/gaudi/server/text_generation_server/layers/gptq/__init__.py
================================================
from dataclasses import dataclass
from typing import List, Optional, Union

import torch
from loguru import logger
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    Weight,
    Weights,
    WeightsLoader,
    DefaultWeightsLoader,
)


from .hpu import QuantLinear


@dataclass
class GPTQWeight(Weight):
    qweight: torch.Tensor
    qzeros: torch.Tensor
    scales: torch.Tensor
    g_idx: Optional[torch.Tensor]
    bits: int
    groupsize: int
    use_awq_kernel: bool
    use_exllama: bool

    def __post_init__(self):
        if self.scales.dtype == torch.float:
            self.scales = self.scales.half()

    @property
    def device(self) -> torch.device:
        return self.qweight.device

    def get_linear(self, bias: torch.Tensor):
        if self.use_awq_kernel:
            try:
                from text_generation_server.layers.awq.quantize import WQLinear

                return WQLinear(
                    w_bit=self.bits,
                    group_size=self.groupsize,
                    qweight=self.qweight,
                    qzeros=self.qzeros,
                    scales=self.scales,
                    bias=bias,
                )
            except ImportError:
                raise NotImplementedError(
                    "You do not seem to have awq installed, either install it (cd server &&  make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
                )
        else:
            return QuantLinear(
                self.qweight,
                self.qzeros,
                self.scales,
                self.g_idx,
                bias,
                self.bits,
                self.groupsize,
            )


class GPTQWeightsLoader(WeightsLoader):
    """
    Loader for GPTQ- and AWQ-quantized weights.
    """

    def __init__(
        self,
        *,
        bits: int,
        desc_act: bool,
        groupsize: int,
        quant_method: str,
        quantize: str,
        sym: bool,
        modules_to_not_convert: List[str],
    ):
        self.bits = bits
        self.desc_act = desc_act
        self.groupsize = groupsize
        self.quant_method = quant_method
        self.quantize = quantize
        self.sym = sym
        self.modules_to_not_convert = modules_to_not_convert

    def is_layer_skipped_quantization(
        self, prefix: str, modules_to_not_convert: List[str]
    ):
        return any(module_name in prefix for module_name in modules_to_not_convert)

    def get_weights(self, weights: Weights, prefix: str):
        self._get_gptq_params(weights)

        use_exllama = True
        if self.bits != 4:
            use_exllama = False

        if self.desc_act:
            log_once(logger.warning, "Disabling exllama because desc_act=True")
            use_exllama = False

        if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert):
            return DefaultWeightsLoader.get_weights(weights, prefix)

        try:
            qweight = weights.get_tensor(f"{prefix}.qweight")
        except RuntimeError:
            raise RuntimeError(
                "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
            )

        if self.quantize == "gptq" and self.quant_method == "gptq":
            g_idx = weights.get_tensor(f"{prefix}.g_idx")
        else:
            g_idx = None

        qzeros = weights.get_tensor(f"{prefix}.qzeros")
        scales = weights.get_tensor(f"{prefix}.scales")

        if use_exllama and g_idx is not None:
            g_idx = g_idx - g_idx[0]

        if self.quantize == "gptq" and self.quant_method == "awq":
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
            if use_exllama:
                g_idx = None
            else:
                g_idx = (
                    torch.arange(
                        qweight.shape[0] * (32 // self.bits),
                        device=qweight.device,
                    )
                    // self.groupsize
                ).to(dtype=torch.int32)

        return GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=self.bits,
            groupsize=self.groupsize,
            use_exllama=use_exllama,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert):
            return DefaultWeightsLoader.get_weights_col_packed(
                weights, prefix, block_sizes
            )
        try:
            qweight = weights.get_packed_sharded(
                f"{prefix}.qweight", dim=1, block_sizes=block_sizes
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
            )
        scales = weights.get_packed_sharded(
            f"{prefix}.scales", dim=1, block_sizes=block_sizes
        )
        scales = scales.to(dtype=weights.dtype)

        self._get_gptq_params(weights)

        qzeros = weights.get_packed_sharded(
            f"{prefix}.qzeros", dim=1, block_sizes=block_sizes
        )
        if self.quantize == "gptq" and self.quant_method == "gptq":
            g_idx = weights.get_tensor(f"{prefix}.g_idx")
        elif self.quantize == "gptq" and self.quant_method == "awq":
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
            g_idx = (
                torch.arange(
                    qweight.shape[0] * (32 // self.bits),
                    device=qweight.device,
                )
                // self.groupsize
            ).to(dtype=torch.int32)
        else:
            g_idx = None

        return GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=self.bits,
            groupsize=self.groupsize,
            use_awq_kernel=self.quantize == "awq",
            use_exllama=False,
        )

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        if self.is_layer_skipped_quantization(prefixes[0], self.modules_to_not_convert):
            return DefaultWeightsLoader.get_multi_weights_col(weights, prefixes, dim)
        try:
            qweight = torch.cat(
                [weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
            )

        scales = torch.cat(
            [weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
        )

        self._get_gptq_params(weights)

        qzeros = torch.cat(
            [weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
        )

        use_exllama = self.bits == 4 and self.quantize == "gptq" and not self.desc_act

        if self.quantize == "gptq" and self.quant_method == "gptq":
            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]
        elif self.quantize == "gptq" and self.quant_method == "awq":
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
            if use_exllama:
                g_idx = None
            else:
                g_idx = (
                    torch.arange(
                        qweight.shape[0] * (32 // self.bits),
                        device=qweight.device,
                    )
                    // self.groupsize
                ).to(dtype=torch.int32)
        else:
            g_idx = None

        return GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=self.bits,
            groupsize=self.groupsize,
            use_awq_kernel=self.quantize == "awq",
            use_exllama=use_exllama,
        )

    def get_multi_weights(self, weights: Weights, prefixes: List[str], dim: int):
        if self.is_layer_skipped_quantization(prefixes[0], self.modules_to_not_convert):
            return DefaultWeightsLoader.get_multi_weights(weights, prefixes, dim)
        try:
            qweight = torch.cat(
                [weights.get_tensor(f"{p}.qweight") for p in prefixes], dim=1
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
            )

        scales = torch.cat([weights.get_tensor(f"{p}.scales") for p in prefixes], dim=1)

        self._get_gptq_params(weights)

        qzeros = torch.cat([weights.get_tensor(f"{p}.qzeros") for p in prefixes], dim=1)

        use_exllama = self.bits == 4 and self.quantize == "gptq" and not self.desc_act

        if self.quantize == "gptq" and self.quant_method == "gptq":
            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]
        elif self.quantize == "gptq" and self.quant_method == "awq":
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
            if use_exllama:
                g_idx = None
            else:
                g_idx = (
                    torch.arange(
                        qweight.shape[0] * (32 // self.bits),
                        device=qweight.device,
                    )
                ).to(dtype=torch.int32)
        else:
            g_idx = None

        return GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=self.bits,
            groupsize=self.groupsize,
            use_awq_kernel=self.quantize == "awq",
            use_exllama=use_exllama,
        )

    def get_weights_row(self, weights: Weights, prefix: str):
        self._get_gptq_params(weights)

        use_exllama = True
        desc_act = self.desc_act
        if self.bits != 4:
            use_exllama = False

        if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert):
            return DefaultWeightsLoader.get_weights_row(weights, prefix)

        if self.desc_act:
            log_once(logger.warning, "Disabling exllama because desc_act=True")
            use_exllama = False

        try:
            qweight = weights.get_sharded(f"{prefix}.qweight", dim=0)
        except RuntimeError:
            raise RuntimeError(
                "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
            )

        if self.quantize == "gptq" and self.quant_method == "gptq":
            g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)
        else:
            g_idx = None

        if weights.process_group.size() > 1:
            if g_idx is not None:
                if (
                    not torch.equal(
                        # Remove g_idx[0] to adapt the check with TP>1.
                        (g_idx - g_idx[0]).cpu(),
                        torch.tensor(
                            [i // self.groupsize for i in range(g_idx.shape[0])],
                            dtype=torch.int32,
                        ),
                    )
                    and not (g_idx == 0).all()
                ):
                    # Exllama implementation does not support row tensor parallelism with act-order, as
                    # it would require to reorder input activations that are split unto several GPUs
                    use_exllama = False
                    desc_act = True

        from text_generation_server.layers.gptq import (
            GPTQWeight,
        )

        if not desc_act and self.groupsize != -1:
            qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0)
            scales = weights.get_sharded(f"{prefix}.scales", dim=0)
            if g_idx is not None:
                # qzeros, scales sharded, and g_idx must be adjusted accordingly
                g_idx = g_idx - g_idx[0]
        else:
            qzeros = weights.get_tensor(f"{prefix}.qzeros")
            scales = weights.get_tensor(f"{prefix}.scales")

        if self.quantize == "gptq" and self.quant_method == "awq":
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
            if use_exllama:
                g_idx = None
            else:
                g_idx = (
                    torch.arange(
                        qweight.shape[0] * (32 // self.bits),
                        device=qweight.device,
                    )
                    // self.groupsize
                ).to(dtype=torch.int32)

        return GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=self.bits,
            groupsize=self.groupsize,
            use_awq_kernel=self.quantize == "awq",
            use_exllama=use_exllama,
        )

    def _get_gptq_params(self, weights: Weights):
        if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"):
            self.bits = weights.get_tensor("gptq_bits").item()
            self.groupsize = weights.get_tensor("gptq_groupsize").item()
            self.desc_act = False
            # `server quantize` used asymmetric quantization unconditionally
            # before the `gptq_sym` setting tensor was added.
            self.sym = (
                weights.get_tensor("gptq_sym").item()
                if weights.has_tensor("gptq_sym")
                else False
            )
            self.quant_method = "gptq"


================================================
FILE: backends/gaudi/server/text_generation_server/layers/gptq/hpu.py
================================================
import math
import numpy as np
import torch
import torch.nn as nn

try:

    convert_from_uint4 = torch.ops.hpu.convert_from_uint4
except Exception as e:
    hpu_import_exception = e

    def error_raiser_hpu(*args, **kwargs):
        raise ValueError(
            f"Trying to use HPU, but could not import the HPU framework with the following error: {hpu_import_exception}"
        )

    convert_from_uint4 = error_raiser_hpu


def pack_tensor(input, bits=4):
    normal = input.to(torch.int32)
    q = torch.zeros((normal.shape[0], normal.shape[1] // 32 * bits), dtype=torch.int32)
    i = 0
    col = 0
    while col < q.shape[1]:
        for j in range(i, i + (32 // bits)):
            q[:, col] |= normal[:, j] << (bits * (j - i))
        i += 32 // bits
        col += 1
    q = q.to(torch.int32)
    return q


class QuantLinear(nn.Module):
    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
        super().__init__()
        self.register_buffer("qweight", qweight)
        self.register_buffer("qzeros", qzeros)
        self.register_buffer("scales", scales)
        self.register_buffer("g_idx", g_idx)
        if bias is not None:
            self.register_buffer("bias", bias)
        else:
            self.bias = None
        if bits not in [4]:
            raise NotImplementedError("Only 4 bits are supported.")
        self.bits = bits
        self.maxq = 2**self.bits - 1
        self.groupsize = groupsize

        self.outfeatures = qweight.shape[1]
        self.infeatures = qweight.shape[0] * 32 // bits
        self.wf = torch.tensor(
            list(range(0, 32, self.bits)), dtype=torch.int32
        ).unsqueeze(0)
        self._preprocessing()

    def unpack_zeros_from_cuda_old_format(self):
        zeros = torch.bitwise_right_shift(
            torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
            self.wf.unsqueeze(0),
        ).to(torch.int16 if self.bits == 8 else torch.int8)

        zeros = zeros + 1
        zeros = torch.bitwise_and(zeros, (2**self.bits) - 1).to(
            self.scales.dtype
        )  # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
        zeros = zeros.reshape(-1, zeros.shape[1] * zeros.shape[2])
        return zeros

    def unpack_weight_from_cuda_old_format(self):
        weight = torch.bitwise_right_shift(
            torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
            self.wf.unsqueeze(-1),
        ).to(torch.int16 if self.bits == 8 else torch.int8)
        weight = torch.bitwise_and(weight, (2**self.bits) - 1)
        weight = weight.reshape((weight.shape[0] * weight.shape[1], weight.shape[2]))
        return weight

    def _preprocessing(self):
        orig_device = self.qweight.device
        self.qweight = self.qweight.cpu()
        weight = self.unpack_weight_from_cuda_old_format()
        new_qweight = pack_tensor(weight)
        self.qweight = new_qweight.to(orig_device)
        # TODO: Support group indexing and remove the check
        columns = self.qweight.shape[0]
        g_idx_trivial = [i // self.groupsize for i in range(columns)]
        g_idx_trivial = torch.tensor(
            g_idx_trivial, dtype=torch.int32, device=self.g_idx.device
        )
        sort_zeros = not (torch.equal(self.g_idx, g_idx_trivial))
        self.qzeros = self.qzeros.cpu()
        zeros = self.unpack_zeros_from_cuda_old_format()
        if sort_zeros:
            zeros_group_1 = torch.zeros(
                (self.infeatures, self.outfeatures),
                dtype=zeros.dtype,
                device=zeros.device,
            )
            scales = self.scales.cpu()
            scale_group_1 = torch.zeros(
                (self.infeatures, self.outfeatures),
                dtype=scales.dtype,
                device=scales.device,
            )
            for i in range(self.infeatures):
                zeros_group_1[i] = zeros[self.g_idx[i]]
                scale_group_1[i] = self.scales[self.g_idx[i]]
            self.qzeros = pack_tensor(zeros_group_1).to(orig_device)
            self.scales = scale_group_1.to(orig_device)
            self.groupsize = 1
            self.g_idx = None
        else:
            new_qzeros = pack_tensor(zeros)
            self.qzeros = new_qzeros.to(orig_device)

    @classmethod
    def new(cls, bits, groupsize, infeatures, outfeatures, bias):
        if bits not in [4]:
            raise NotImplementedError("Only 4 bits are supported.")

        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
        qzeros = torch.zeros(
            (math.ceil(infeatures / groupsize), outfeatures // 32 * bits),
            dtype=torch.int32,
        )
        scales = torch.zeros(
            (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
        )
        g_idx = torch.tensor(
            [i // groupsize for i in range(infeatures)], dtype=torch.int32
        )
        if bias:
            bias = torch.zeros((outfeatures), dtype=torch.float16)
        else:
            bias = None
        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)

    def pack(self, linear, scales, zeros, g_idx=None):
        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx

        scales = scales.t().contiguous()
        zeros = zeros.t().contiguous()
        scale_zeros = zeros * scales
        self.scales = scales.clone().half()
        if linear.bias is not None:
            self.bias = linear.bias.clone().half()

        intweight = []
        for idx in range(self.infeatures):
            intweight.append(
                torch.round(
                    (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
                    / self.scales[self.g_idx[idx]]
                ).to(torch.int)[:, None]
            )
        intweight = torch.cat(intweight, dim=1)
        intweight = intweight.t().contiguous()
        intweight = intweight.numpy().astype(np.uint32)
        qweight = np.zeros(
            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
        )
        i = 0
        row = 0
        while row < qweight.shape[0]:
            if self.bits in [4]:
                for j in range(i, i + (32 // self.bits)):
                    qweight[row] |= intweight[j] << (self.bits * (j - i))
                i += 32 // self.bits
                row += 1
            else:
                raise NotImplementedError("Only 4 bits are supported.")

        qweight = qweight.astype(np.int32)
        self.qweight = torch.from_numpy(qweight)

        zeros -= 1
        zeros = zeros.numpy().astype(np.uint32)
        qzeros = np.zeros(
            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
        )
        i = 0
        col = 0
        while col < qzeros.shape[1]:
            if self.bits in [4]:
                for j in range(i, i + (32 // self.bits)):
                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
                i += 32 // self.bits
                col += 1
            else:
                raise NotImplementedError("Only 4 bits are supported.")

        qzeros = qzeros.astype(np.int32)
        self.qzeros = torch.from_numpy(qzeros)

    def forward(self, x):
        out_shape = x.shape[:-1] + (self.outfeatures,)
        x = x.reshape(-1, x.shape[-1])
        weight = convert_from_uint4(self.qweight, self.scales, self.qzeros, x.dtype)
        out = torch.matmul(x, weight)
        out = out.reshape(out_shape)
        out = out + self.bias if self.bias is not None else out
        return out


================================================
FILE: backends/gaudi/server/text_generation_server/layers/gptq/quantize.py
================================================
import time
import torch.nn as nn
import math
import json
import os
import torch
import transformers

from texttable import Texttable
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
from huggingface_hub import HfApi
from accelerate import init_empty_weights
from text_generation_server.utils import initialize_torch_distributed, Weights
from text_generation_server.utils.hub import weight_files
from text_generation_server.layers.gptq import QuantLinear
from loguru import logger
from typing import Optional
from text_generation_server.layers.gptq.utils import torch_snr_error

from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight

DEV = torch.device("cuda:0")


class Quantizer(nn.Module):
    def __init__(self, shape=1):
        super(Quantizer, self).__init__()
        self.register_buffer("maxq", torch.tensor(0))
        self.register_buffer("scale", torch.zeros(shape))
        self.register_buffer("zero", torch.zeros(shape))

    def configure(
        self,
        bits,
        perchannel=False,
        sym=True,
        mse=False,
        norm=2.4,
        grid=100,
        maxshrink=0.8,
        trits=False,
    ):
        self.maxq = torch.tensor(2**bits - 1)
        self.perchannel = perchannel
        self.sym = sym
        self.mse = mse
        self.norm = norm
        self.grid = grid
        self.maxshrink = maxshrink
        if trits:
            self.maxq = torch.tensor(-1)
        self.scale = torch.zeros_like(self.scale)

    def _quantize(self, x, scale, zero, maxq):
        if maxq < 0:
            return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
        q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
        return scale * (q - zero)

    def find_params(self, x, weight=False):
        dev = x.device
        self.maxq = self.maxq.to(dev)

        shape = x.shape
        if self.perchannel:
            if weight:
                x = x.flatten(1)
            else:
                if len(shape) == 4:
                    x = x.permute([1, 0, 2, 3])
                    x = x.flatten(1)
                if len(shape) == 3:
                    x = x.reshape((-1, shape[-1])).t()
                if len(shape) == 2:
                    x = x.t()
        else:
            x = x.flatten().unsqueeze(0)

        tmp = torch.zeros(x.shape[0], device=dev)
        xmin = torch.minimum(x.min(1)[0], tmp)
        xmax = torch.maximum(x.max(1)[0], tmp)

        if self.sym:
            xmax = torch.maximum(torch.abs(xmin), xmax)
            tmp = xmin < 0
            if torch.any(tmp):
                xmin[tmp] = -xmax[tmp]
        tmp = (xmin == 0) & (xmax == 0)
        xmin[tmp] = -1
        xmax[tmp] = +1

        if self.maxq < 0:
            self.scale = xmax
            self.zero = xmin
        else:
            self.scale = (xmax - xmin) / self.maxq
            if self.sym:
                self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
            else:
                self.zero = torch.round(-xmin / self.scale)

        if self.mse:
            best = torch.full([x.shape[0]], float("inf"), device=dev)
            for i in range(int(self.maxshrink * self.grid)):
                p = 1 - i / self.grid
                xmin1 = p * xmin
                xmax1 = p * xmax
                scale1 = (xmax1 - xmin1) / self.maxq
                zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
                q = self._quantize(
                    x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq
                )
                q -= x
                q.abs_()
                q.pow_(self.norm)
                err = torch.sum(q, 1)
                tmp = err < best
                if torch.any(tmp):
                    best[tmp] = err[tmp]
                    self.scale[tmp] = scale1[tmp]
                    self.zero[tmp] = zero1[tmp]
        if not self.perchannel:
            if weight:
                tmp = shape[0]
            else:
                tmp = shape[1] if len(shape) != 3 else shape[2]
            self.scale = self.scale.repeat(tmp)
            self.zero = self.zero.repeat(tmp)

        if weight:
            shape = [-1] + [1] * (len(shape) - 1)
            self.scale = self.scale.reshape(shape)
            self.zero = self.zero.reshape(shape)
            return
        if len(shape) == 4:
            self.scale = self.scale.reshape((1, -1, 1, 1))
            self.zero = self.zero.reshape((1, -1, 1, 1))
        if len(shape) == 3:
            self.scale = self.scale.reshape((1, 1, -1))
            self.zero = self.zero.reshape((1, 1, -1))
        if len(shape) == 2:
            self.scale = self.scale.unsqueeze(0)
            self.zero = self.zero.unsqueeze(0)

    def quantize(self, x):
        if self.ready():
            return self._quantize(x, self.scale, self.zero, self.maxq)

        return x

    def enabled(self):
        return self.maxq > 0

    def ready(self):
        return torch.all(self.scale != 0)


class GPTQ:
    def __init__(self, layer, observe=False):
        self.layer = layer
        self.dev = self.layer.weight.device
        W = layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        self.rows = W.shape[0]
        self.columns = W.shape[1]
        self.H = torch.zeros((self.columns, self.columns), device=self.dev)
        self.nsamples = 0
        self.quantizer = Quantizer()
        self.observe = observe

    def add_batch(self, inp, out):
        # Hessian H = 2 X XT + λ I
        if self.observe:
            self.inp1 = inp
            self.out1 = out
        else:
            self.inp1 = None
            self.out1 = None

        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
        tmp = inp.shape[0]
        if isinstance(self.layer, nn.Linear) or isinstance(
            self.layer, transformers.Conv1D
        ):
            if len(inp.shape) == 3:
                inp = inp.reshape((-1, inp.shape[-1]))
            inp = inp.t()
        if isinstance(self.layer, nn.Conv2d):
            unfold = nn.Unfold(
                self.layer.kernel_size,
                dilation=self.layer.dilation,
                padding=self.layer.padding,
                stride=self.layer.stride,
            )
            inp = unfold(inp)
            inp = inp.permute([1, 0, 2])
            inp = inp.flatten(1)
        self.H *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        # inp = inp.float()
        inp = math.sqrt(2 / self.nsamples) * inp.float()
        # self.H += 2 / self.nsamples * inp.matmul(inp.t())
        self.H += inp.matmul(inp.t())

    def print_loss(self, name, q_weight, weight_error, timecost):
        table = Texttable()
        length = 28
        name = (
            (name + " " * (length - len(name)))
            if len(name) <= length
            else name[:length]
        )

        table.header(["name", "weight_error", "fp_inp_SNR", "q_inp_SNR", "time"])

        # assign weight
        self.layer.weight.data = q_weight.reshape(self.layer.weight.shape).to(
            self.layer.weight.data.dtype
        )

        if self.inp1 is not None:
            # quantize input to int8
            quantizer = Quantizer()
            quantizer.configure(8, perchannel=False, sym=True, mse=False)
            quantizer.find_params(self.inp1)
            q_in = quantizer.quantize(self.inp1).type(torch.float16)
            q_out = self.layer(q_in)

            # get kinds of SNR
            q_SNR = torch_snr_error(q_out, self.out1).item()
            fp_SNR = torch_snr_error(self.layer(self.inp1), self.out1).item()
        else:
            q_SNR = "-"
            fp_SNR = "-"

        table.add_row([name, weight_error, fp_SNR, q_SNR, timecost])
        print(table.draw().split("\n")[-2])

    def fasterquant(
        self, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False, name=""
    ):
        self.layer.to(self.dev)

        W = self.layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        W = W.float()

        tick = time.time()

        if not self.quantizer.ready():
            self.quantizer.find_params(W, weight=True)

        H = self.H
        if not self.observe:
            del self.H
        dead = torch.diag(H) == 0
        H[dead, dead] = 1
        W[:, dead] = 0

        if act_order:
            perm = torch.argsort(torch.diag(H), descending=True)
            W = W[:, perm]
            H = H[perm][:, perm]

        Losses = torch.zeros_like(W)
        Q = torch.zeros_like(W)

        damp = percdamp * torch.mean(torch.diag(H))
        diag = torch.arange(self.columns, device=self.dev)
        H[diag, diag] += damp
        H = torch.linalg.cholesky(H)
        H = torch.cholesky_inverse(H)
        try:
            H = torch.linalg.cholesky(H, upper=True)
        except Exception:
            # Addition because Falcon fails on h_to_4h
            H = torch.linalg.cholesky(
                H + 1e-5 * torch.eye(H.shape[0]).to(H.device), upper=True
            )
        Hinv = H

        g_idx = []
        scale = []
        zero = []
        now_idx = 1

        for i1 in range(0, self.columns, blocksize):
            i2 = min(i1 + blocksize, self.columns)
            count = i2 - i1

            W1 = W[:, i1:i2].clone()
            Q1 = torch.zeros_like(W1)
            Err1 = torch.zeros_like(W1)
            Losses1 = torch.zeros_like(W1)
            Hinv1 = Hinv[i1:i2, i1:i2]

            for i in range(count):
                w = W1[:, i]
                d = Hinv1[i, i]

                if groupsize != -1:
                    if (i1 + i) % groupsize == 0:
                        self.quantizer.find_params(
                            W[:, (i1 + i) : (i1 + i + groupsize)], weight=True
                        )

                    if ((i1 + i) // groupsize) - now_idx == -1:
                        scale.append(self.quantizer.scale)
                        zero.append(self.quantizer.zero)
                        now_idx += 1

                q = self.quantizer.quantize(w.unsqueeze(1)).flatten()
                Q1[:, i] = q
                Losses1[:, i] = (w - q) ** 2 / d**2

                err1 = (w - q) / d
                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
                Err1[:, i] = err1

            Q[:, i1:i2] = Q1
            Losses[:, i1:i2] = Losses1 / 2

            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])

        torch.cuda.synchronize()
        error = torch.sum(Losses).item()

        groupsize = groupsize if groupsize != -1 else self.columns
        g_idx = [i // groupsize for i in range(self.columns)]
        g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device)
        if act_order:
            invperm = torch.argsort(perm)
            Q = Q[:, invperm]
            g_idx = g_idx[invperm]

        if isinstance(self.layer, transformers.Conv1D):
            Q = Q.t()

        self.print_loss(
            name=name, q_weight=Q, weight_error=error, timecost=(time.time() - tick)
        )

        if scale == []:
            scale.append(self.quantizer.scale)
            zero.append(self.quantizer.zero)
        scale = torch.cat(scale, dim=1)
        zero = torch.cat(zero, dim=1)
        return scale, zero, g_idx, error

    def free(self):
        self.inp1 = None
        self.out1 = None
        self.H = None
        self.Losses = None
        self.Trace = None
        torch.cuda.empty_cache()


def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
    testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset("ptb_text_only", "penn_treebank", split="train")
    valdata = load_dataset("ptb_text_only", "penn_treebank", split="validation")

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    trainenc = tokenizer("\n\n".join(traindata["sentence"]), return_tensors="pt")
    testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt")

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset(
        "allenai/c4",
        "allenai--c4",
        data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
        split="train",
        use_auth_token=False,
    )
    valdata = load_dataset(
        "allenai/c4",
        "allenai--c4",
        data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
        split="validation",
        use_auth_token=False,
    )

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]["text"], return_tensors="pt")
            if trainenc.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    import random

    random.seed(0)
    valenc = []
    for _ in range(256):
        while True:
            i = random.randint(0, len(valdata) - 1)
            tmp = tokenizer(valdata[i]["text"], return_tensors="pt")
            if tmp.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        valenc.append(tmp.input_ids[:, i:j])
    valenc = torch.hstack(valenc)

    class TokenizerWrapper:
        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset("ptb_text_only", "penn_treebank", split="train")
    testdata = load_dataset("ptb_text_only", "penn_treebank", split="test")

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    trainenc = tokenizer(" ".join(traindata["sentence"]), return_tensors="pt")
    testenc = tokenizer(" ".join(testdata["sentence"]), return_tensors="pt")

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset(
        "allenai/c4",
        "allenai--c4",
        data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
        split="train",
    )
    valdata = load_dataset(
        "allenai/c4",
        "allenai--c4",
        data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
        split="validation",
    )

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]["text"], return_tensors="pt")
            if trainenc.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    valenc = tokenizer(" ".join(valdata[:1100]["text"]), return_tensors="pt")
    valenc = valenc.input_ids[:, : (256 * seqlen)]

    class TokenizerWrapper:
        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_loaders(
    name, nsamples=128, seed=0, seqlen=2048, model_id="", trust_remote_code=False
):
    if "wikitext2" in name:
        return get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code)
    if "ptb" in name:
        if "new" in name:
            return get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code)
        return get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code)
    if "c4" in name:
        if "new" in name:
            return get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code)
        return get_c4(nsamples, seed, seqlen, model_id, trust_remote_code)


def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
    # Skip last lm_head linear
    # Need isintance Falcon is inheriting Linear.
    if isinstance(module, layers) and "lm_head" not in name:
        return {name: module}
    res = {}
    for name1, child in module.named_children():
        res.update(
            find_layers(
                child, layers=layers, name=name + "." + name1 if name != "" else name1
            )
        )
    return res


@torch.no_grad()
def sequential(
    model,
    dataloader,
    dev,
    nsamples,
    bits,
    groupsize,
    *,
    hooks,
    percdamp=0.01,
    sym: bool = False,
    act_order: bool = False,
):
    print("Starting ...")

    use_cache = model.config.use_cache
    model.config.use_cache = False
    try:
        layers = model.model.layers
        prefix = "model.layers"
    except Exception:
        layers = model.transformer.h
        prefix = "transformer.h"

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
    )

    cache = {"i": 0}
    extra = {}

    class Catcher(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache["i"]] = inp
            cache["i"] += 1
            extra.update(kwargs.copy())
            raise ValueError

    layers[0] = Catcher(layers[0])
    for batch in dataloader:
        try:
            model(batch[0].cuda())
        except ValueError:
            pass
    layers[0] = layers[0].module

    # layers[0] = layers[0].cpu()
    # model.model.embed_tokens = model.model.embed_tokens.cpu()
    # model.model.norm = model.model.norm.cpu()
    torch.cuda.empty_cache()
    for hook in hooks:
        hook.remove()

    outs = torch.zeros_like(inps)

    extra = {
        k: v.to(dev) if isinstance(v, torch.Tensor) else v for k, v in extra.items()
    }

    print("Ready.")

    quantizers = {}
    for i in range(len(layers)):
        print(f"Quantizing layer {i+1}/{len(layers)}..")
        print("+------------------+--------------+------------+-----------+-------+")
        print("|       name       | weight_error | fp_inp_SNR | q_inp_SNR | time  |")
        print("+==================+==============+============+===========+=======+")

        layer = layers[i]
        layer.load()
        full = find_layers(layer)
        sequential = [list(full.keys())]

        for names in sequential:
            subset = {n: full[n] for n in names}
            gptq = {}
            for name in subset:
                gptq[name] = GPTQ(subset[name])
                gptq[name].quantizer.configure(
                    bits, perchannel=True, sym=sym, mse=False
                )
                pass

            def add_batch(name):
                nonlocal gptq

                def tmp(_, inp, out):
                    gptq[name].add_batch(inp[0].data, out.data)

                return tmp

            handles = []
            for name in subset:
                handles.append(subset[name].register_forward_hook(add_batch(name)))
            for j in range(nsamples):
                outs[j] = layer(inps[j].unsqueeze(0), **extra)[0]
            for h in handles:
                h.remove()

            for name in subset:
                scale, zero, g_idx, error = gptq[name].fasterquant(
                    percdamp=percdamp,
                    groupsize=groupsize,
                    act_order=act_order,
                    name=name,
                )
                quantizers[f"{prefix}.{i}.{name}"] = (
                    gptq[name].quantizer.cpu(),
                    scale.cpu(),
                    zero.cpu(),
                    g_idx.cpu(),
                    bits,
                    groupsize,
                )

                gptq[name].free()

        for j in range(nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), **extra)[0]

        layer.unload()
        del layer
        del gptq
        torch.cuda.empty_cache()

        inps, outs = outs, inps
        print("+------------------+--------------+------------+-----------+-------+")
        print("\n")

    model.config.use_cache = use_cache

    return quantizers


def make_quant_linear(module, names, bits, groupsize, name=""):
    if isinstance(module, QuantLinear):
        return
    for attr in dir(module):
        tmp = getattr(module, attr)
        name1 = name + "." + attr if name != "" else attr
        if name1 in names:
            delattr(module, attr)
            setattr(
                module,
                attr,
                QuantLinear.new(
                    bits,
                    groupsize,
                    tmp.in_features,
                    tmp.out_features,
                    tmp.bias is not None,
                ),
            )
    for name1, child in module.named_children():
        make_quant_linear(
            child, names, bits, groupsize, name + "." + name1 if name != "" else name1
        )


# TODO: perform packing on GPU
def pack(model, quantizers, bits, groupsize):
    layers = find_layers(model)
    layers = {n: layers[n] for n in quantizers}
    make_quant_linear(model, quantizers, bits, groupsize)
    qlayers = find_layers(model, (QuantLinear,))
    print("Packing ...")
    for name in qlayers:
        print(name)
        quantizers[name], scale, zero, g_idx, _, _ = quantizers[name]
        qlayers[name].pack(layers[name], scale, zero, g_idx)
    print("Done.")
    return model


def setdeepattr(module, full_name, tensor):
    current = module
    tokens = full_name.split(".")
    for token in tokens[:-1]:
        current = getattr(current, token)
    setattr(current, tokens[-1], tensor)


def getdeepattr(module, full_name):
    current = module
    tokens = full_name.split(".")
    for token in tokens:
        current = getattr(current, token)
    return current


def load_weights_pre_hook(module_name, weights, recursive=False):
    def inner(module, args):
        print(f"Pre hook {module_name}")
        local_params = {}
        for k, v in module.named_parameters():
            if not recursive and k.count(".") != 1:
                continue
            local_params[k] = v
        for k, v in module.named_buffers():
            if not recursive and k.count(".") != 1:
                continue
            local_params[k] = v

        for local_param in local_params:
            current_tensor = getdeepattr(module, local_param)
            if current_tensor.device == torch.device("meta"):
                # print(f"Loading {local_param}")
                if module_name:
                    tensor_name = f"{module_name}.{local_param}"
                else:
                    tensor_name = local_param
                tensor = weights.get_tensor(tensor_name)
                setdeepattr(module, local_param, nn.Parameter(tensor))
            else:
                tensor = current_tensor.to(device=torch.device("cuda:0"))
                if current_tensor.requires_grad:
                    tensor = nn.Parameter(tensor)
                setdeepattr(module, local_param, tensor)

    return inner


def load_weights_post_hook(module_name, weights, recursive=False):
    def inner(module, args, output):
        print(f"Post hook {module_name}")
        local_params = {}
        for k, v in module.named_parameters():
            if not recursive and k.count(".") != 1:
                continue
            local_params[k] = v
        for k, v in module.named_buffers():
            if not recursive and k.count(".") != 1:
                continue
            local_params[k] = v
        for local_param in local_params:
            # print(f"Unloading {local_param}")
            current_tensor = getdeepattr(module, local_param)
            setdeepattr(
                module,
                local_param,
                nn.Parameter(current_tensor.to(device=torch.device("cpu"))),
            )
        return output

    return inner


def quantize(
    model_id: str,
    bits: int,
    groupsize: int,
    output_dir: str,
    revision: str,
    trust_remote_code: bool,
    upload_to_model_id: Optional[str],
    percdamp: float,
    act_order: bool,
    sym: bool,
):
    print("loading model")
    config = AutoConfig.from_pretrained(
        model_id,
        trust_remote_code=trust_remote_code,
    )

    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(
            config, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
        )
    model = model.eval()

    print("LOADED model")
    files = weight_files(model_id, revision, extension=".safetensors")
    process_group, _, _ = initialize_torch_distributed()
    weights = Weights(
        files,
        device=torch.device("cuda:0"),
        dtype=torch.float16,
        process_group=process_group,
        aliases={"embed_tokens.weight": ["lm_head.weight"]},
        weights_loader=DefaultWeightsLoader(UnquantizedWeight),
    )
    hooks = []
    for name, module in model.named_modules():

        def load(module, name):
            def _load():
                load_weights_pre_hook(name, weights, recursive=True)(module, None)

            return _load

        def unload(module, name):
            def _unload():
                load_weights_post_hook(name, weights, recursive=True)(
                    module, None, None
                )

            return _unload

        module.load = load(module, name)
        module.unload = unload(module, name)
        hooks.append(
            module.register_forward_pre_hook(load_weights_pre_hook(name, weights))
        )
        hooks.append(
            module.register_forward_hook(load_weights_post_hook(name, weights))
        )
    model.seqlen = 2048

    dataset = "wikitext2"
    nsamples = 128
    seed = None

    dataloader, testloader = get_loaders(
        dataset,
        nsamples=nsamples,
        seed=seed,
        model_id=model_id,
        seqlen=model.seqlen,
        trust_remote_code=trust_remote_code,
    )

    tick = time.time()
    quantizers = sequential(
        model,
        dataloader,
        DEV,
        nsamples,
        bits,
        groupsize,
        percdamp=percdamp,
        act_order=act_order,
        hooks=hooks,
        sym=sym,
    )
    print(time.time() - tick)

    pack(model, quantizers, bits, groupsize)
    from safetensors.torch import save_file
    from huggingface_hub import split_torch_state_dict_into_shards

    state_dict = model.state_dict()
    state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()}

    max_shard_size = "10GB"
    state_dict_split = split_torch_state_dict_into_shards(
        state_dict,
        filename_pattern="model.safetensors",
        max_shard_size=max_shard_size,
    )
    index = None
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
    shards = state_dict_split.filename_to_tensors
    os.makedirs(output_dir, exist_ok=True)
    for shard_file, shard in shards.items():
        save_file(
            shard,
            os.path.join(output_dir, shard_file),
            metadata={
                "format": "pt",
                "quantized": "gptq",
                "origin": "text-generation-inference",
            },
        )
    if index is None:
        path_to_weights = os.path.join(output_dir, "model.safetensors")
        logger.info(f"Model weights saved in {path_to_weights}")
    else:
        save_index_file = "model.safetensors.index.json"
        save_index_file = os.path.join(output_dir, save_index_file)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)
        logger.info(
            f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
            f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the "
            f"index located at {save_index_file}."
        )
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    config.quantization_config = {
        "bits": bits,
        "group_size": groupsize,
        "damp_percent": percdamp,
        "desc_act": act_order,
        "static_groups": False,
        "sym": sym,
        "quant_method": "gptq",
    }
    config.save_pretrained(output_dir)
    logger.info("Saved config")
    logger.info("Saving tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, trust_remote_code=trust_remote_code
    )
    tokenizer.save_pretrained(output_dir)
    logger.info("Saved tokenizer")

    if upload_to_model_id:
        api = HfApi()

        api.upload_folder(
            folder_path=output_dir, repo_id=upload_to_model_id, repo_type="model"
        )


================================================
FILE: backends/gaudi/server/text_generation_server/layers/gptq/utils.py
================================================
import torch


# copied from https://github.com/openppl-public/ppq/blob/master/ppq/quantization/measure/norm.py
def torch_snr_error(
    y_pred: torch.Tensor, y_real: torch.Tensor, reduction: str = "mean"
) -> torch.Tensor:
    """
    Compute SNR between y_pred(tensor) and y_real(tensor)

    SNR can be calcualted as following equation:

        SNR(pred, real) = (pred - real) ^ 2 / (real) ^ 2

    if x and y are matrixs, SNR error over matrix should be the mean value of SNR error over all elements.

        SNR(pred, real) = mean((pred - real) ^ 2 / (real) ^ 2)

    Args:
        y_pred (torch.Tensor): _description_
        y_real (torch.Tensor): _description_
        reduction (str, optional): _description_. Defaults to 'mean'.

    Raises:
        ValueError: _description_
        ValueError: _description_

    Returns:
        torch.Tensor: _description_
    """
    if y_pred.shape != y_real.shape:
        raise ValueError(
            f"Can not compute snr loss for tensors with different shape. "
            f"({y_pred.shape} and {y_real.shape})"
        )
    reduction = str(reduction).lower()

    if y_pred.ndim == 1:
        y_pred = y_pred.unsqueeze(0)
        y_real = y_real.unsqueeze(0)

    y_pred = y_pred.flatten(start_dim=1)
    y_real = y_real.flatten(start_dim=1)

    noise_power = torch.pow(y_pred - y_real, 2).sum(dim=-1)
    signal_power = torch.pow(y_real, 2).sum(dim=-1)
    snr = (noise_power) / (signal_power + 1e-7)

    if reduction == "mean":
        return torch.mean(snr)
    elif reduction == "sum":
        return torch.sum(snr)
    elif reduction == "none":
        return snr
    else:
        raise ValueError("Unsupported reduction method.")


================================================
FILE: backends/gaudi/server/text_generation_server/layers/layernorm.py
================================================
import torch
from torch import nn
from accelerate import init_empty_weights


# Monkey patching
@classmethod
def load_layer_norm(cls, prefix, weights, eps):
    weight = weights.get_tensor(f"{prefix}.weight")
    bias = weights.get_tensor(f"{prefix}.bias")
    with init_empty_weights():
        ln = cls(weight.shape, eps=eps)

    ln.weight = torch.nn.Parameter(weight)
    ln.bias = torch.nn.Parameter(bias)
    return ln


@classmethod
def load_layer_norm_no_bias(cls, prefix, weights, eps):
    weight = weights.get_tensor(f"{prefix}.weight")
    with init_empty_weights():
        ln = cls(weight.shape, eps=eps)

    ln.weight = torch.nn.Parameter(weight)
    ln.bias = None
    return ln


torch.nn.LayerNorm.load = load_layer_norm
torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias


class FastLayerNorm(nn.LayerNorm):
    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states

        return super().forward(hidden_states), residual


class FastRMSNorm(nn.Module):
    def __init__(self, weight: torch.Tensor, eps: float):
        super().__init__()

        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps

    @classmethod
    def load(cls, prefix, weights, eps=1e-6):
        weight = weights.get_tensor(f"{prefix}.weight")
        return cls(weight, eps)

    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(self.weight.dtype), residual


================================================
FILE: backends/gaudi/server/text_generation_server/layers/linear.py
================================================
import torch
from torch.nn import functional as F


class FastLinear(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(weight, requires_grad=False)
        if bias is not None:
            self.bias = torch.nn.Parameter(bias, requires_grad=False)
        else:
            self.bias = None

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.linear(input, self.weight, self.bias)


def get_linear(weight, bias):
    # Weights that are loaded through methods that are not
    # quantization-aware are still bare tensors. We may want
    # to change this in the future.
    if isinstance(weight, torch.Tensor):
        return FastLinear(weight, bias)

    return weight.get_linear(bias)


================================================
FILE: backends/gaudi/server/text_generation_server/layers/lora.py
================================================
from typing import TYPE_CHECKING, Optional, List

import torch
import torch.distributed
from torch import nn
from torch.distributed import ProcessGroup

from text_generation_server.utils.sgmv import (
    add_lora_a_bgmv,
    add_lora_b_bgmv,
    has_sgmv,
    lora_a_sgmv_cutlass,
    lora_b_sgmv_cutlass,
    orient_for_rank,
)

if TYPE_CHECKING:
    from text_generation_server.adapters import AdapterBatchData
    from text_generation_server.adapters.lora import BatchLoraWeights


class LoraLinear(nn.Module):
    def __init__(
        self, base_layer: nn.Module, layer_id: int, process_group: ProcessGroup
    ):
        super().__init__()
        self.base_layer = base_layer
        self.layer_id = layer_id
        self.process_group = process_group

    def forward_layer_type(
        self,
        result: torch.Tensor,
        input: torch.Tensor,
        adapter_data: "AdapterBatchData",
        layer_type: str,
        start_idx: int,
        end_idx: int,
    ) -> torch.Tensor:
        if adapter_data is None:
            return result
        data: Optional["BatchLoraWeights"] = adapter_data.data.get(layer_type)

        if has_sgmv() and data is not None and data.can_vectorize(self.process_group):
            # In tensor-parallel configurations, each GPU processes a specific segment of the output.
            # The 'result' tensor represents the full output, which can vary in size based on
            # the layer type (e.g., attention vs. feed-forward layers). We define the current
            # segment using start_idx and end_idx. If the segment size doesn't match this GPU's
            # slice of 'result', we create a zero tensor of the correct size for LoRA computation.
            # This approach ensures accurate LoRA application across various layer sizes and
            # configurations, adapting to different model architectures and parallelization strategies.
            #
            # Example scenarios where this is necessary:
            # 1. The adapter's size doesn't evenly divide across GPUs.
            # 2. We're processing the last segment which might be smaller.
            # 3. Different projection layers (q, k, v) have different sizes.
            if end_idx - start_idx != result.shape[1]:
                proj = torch.zeros_like(result[:, start_idx:end_idx])
            else:
                proj = result

            for r, rank_segments in data.rank_data.items():
                lora_a_ptr = rank_segments.lora_a_ptr
                lora_b_ptr = rank_segments.lora_b_ptr

                if lora_a_ptr is None or lora_b_ptr is None:
                    raise ValueError("LoRA data is missing")

                if data.use_sgmv:
                    # Use SGMV for prefill
                    v = lora_a_sgmv_cutlass(
                        input,
                        rank_segments.tmp_shrink,
                        lora_a_ptr,
                        rank_segments.segment_starts,
                        rank_segments.segment_ends,
                        self.layer_id,
                        r,
                    )

                    if self.process_group.size() > 1:
                        v = self.collect_lora_a(v)

                    lora_b_sgmv_cutlass(
                        proj,
                        v,
                        rank_segments.tmp_expand,
                        lora_b_ptr,
                        rank_segments.segment_starts,
                        rank_segments.segment_ends,
                        self.layer_id,
                    )
                else:
                    # Use BGMV for decode
                    v = torch.zeros(
                        (input.size(0), r), dtype=input.dtype, device=input.device
                    )
                    # TODO: error with [-1, 0], but not [0, -1]
                    add_lora_a_bgmv(
                        v,
                        input,
                        lora_a_ptr,
                        rank_segments.indices,
                        self.layer_id,
                    )

                    if self.process_group.size() > 1:
                        v = self.collect_lora_a(v)

                    add_lora_b_bgmv(
                        proj,
                        v,
                        lora_b_ptr,
                        rank_segments.indices,
                        self.layer_id,
                    )

            if end_idx - start_idx != result.shape[1]:
                result[:, start_idx:end_idx] += proj
        else:
            for adapter_index in adapter_data.meta.adapter_set:
                if data is not None and data.has_adapter(adapter_index):
                    adapter_mask = (
                        (adapter_data.meta.adapter_indices == adapter_index)
                        .to(input.dtype)
                        .view(-1, 1)
                    )
                    layer_result = self.forward_lora(
                        input, data, adapter_index, adapter_mask
                    )
                    result[:, start_idx:end_idx] += layer_result

        return result

    def forward_lora(
        self,
        input: torch.Tensor,
        data: "BatchLoraWeights",
        adapter_index: int,
        adapter_mask: torch.Tensor,
    ) -> torch.Tensor:
        lora_a = data.lora_a[adapter_index][self.layer_id, :, :]
        lora_b = data.lora_b[adapter_index][self.layer_id, :, :]

        lora_a = orient_for_rank(lora_a, lora_b.size(0))

        a_out = input @ lora_a
        if self.process_group.size() > 1:
            a_out = self.collect_lora_a(a_out)

        result = (a_out @ lora_b) * adapter_mask
        return result

    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError("Implemented in subclasses")


class TensorParallelMultiAdapterLinear(LoraLinear):
    def __init__(
        self,
        base_layer: nn.Module,
        layer_id: int,
        layer_names: List[str],
        sizes: List[int],
        process_group: ProcessGroup,
    ):
        super().__init__(base_layer, layer_id, process_group)
        self.layer_names = layer_names
        self.sizes = sizes

    @classmethod
    def load(
        cls,
        base_layer: nn.Module,
        layer_id: int,
        layer_names: List[str],
        sizes: List[int],
        process_group: ProcessGroup,
    ):
        return TensorParallelMultiAdapterLinear(
            base_layer, layer_id, layer_names, sizes, process_group
        )

    def forward(
        self, input: torch.Tensor, adapter_data: "AdapterBatchData"
    ) -> torch.Tensor:
        result = self.base_layer(input)

        # noop if no layer names are provided (e.g. for models without adapters)
        if self.layer_names is None:
            return result

        # handle models like Bloom that have inputs of shape
        # (batch_size, sequence_length, hidden_size)
        # we need to reshape them to (batch_size * sequence_length, hidden_size)
        # for the LoRA computation, then reshape back
        prev_shape = result.shape
        is_3d = len(input.shape) >= 3
        if is_3d:
            input = input.reshape(-1, input.shape[-1])
            result = result.reshape(-1, result.shape[-1])

        offset = 0
        for i, layer_name in enumerate(self.layer_names):
            start_idx = offset // self.process_group.size()
            # The 'sizes' parameter is essential in tensor-parallel setups for handling multiple
            # projection layers (q_proj, k_proj, v_proj) by defining their output dimensions. It
            # ensures correct slicing of the result tensor, accommodating variations like grouped-query
            # attention where k_proj and v_proj differ from q_proj. This allows precise application of
            # LoRA adapters to each sub-component of the multi-head attention mechanism, managing the
            # different projection sizes across layers and model architectures.
            if self.sizes is not None:
                offset += self.sizes[i]
                end_idx = offset // self.process_group.size()
            else:
                end_idx = result.shape[1]

            result = self.forward_layer_type(
                result, input, adapter_data, layer_name, start_idx, end_idx
            )

        if is_3d:
            result = result.reshape(prev_shape)

        return result

    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
        # Tensor parallel implementation of X @ A@B, where A and B are sharded column-wise.
        # We use an all-gather between X@A and (X@A)@B to ensure alignment across ranks.
        #
        # TODO(travis): this is not very efficient as we do an all-gather for every adapter,
        #   instead we could pre-allocate a (B, a, r) tensor for all adapters with the same
        #   rank, compute `a_out` on each, and then slice them into the buffer as shown here:
        #   https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609
        gathered_tensors = [
            torch.empty_like(a_out) for _ in range(self.process_group.size())
        ]
        torch.distributed.all_gather(gathered_tensors, a_out)
        return torch.cat(gathered_tensors, dim=1)


class TensorParallelAdapterRowLinear(LoraLinear):
    def __init__(self, base_layer, layer_id, layer_name, process_group):
        super().__init__(base_layer, layer_id, process_group)
        self.layer_name = layer_name

    @classmethod
    def load(cls, base_layer, layer_id, layer_name, process_group):
        return cls(base_layer, layer_id, layer_name, process_group)

    def forward(
        self, input: torch.Tensor, adapter_data: "AdapterBatchData"
    ) -> torch.Tensor:
        result = self.base_layer(input)

        if self.layer_name is None:
            return result

        # Fused all-gather + all-reduce from S-LoRA paper: https://arxiv.org/abs/2311.03285
        stride = result.shape[-1] // self.process_group.size()
        start_idx = self.process_group.rank() * stride
        end_idx = (self.process_group.rank() + 1) * stride

        self.forward_layer_type(
            result, input, adapter_data, self.layer_name, start_idx, end_idx
        )

        return result

    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
        # Tensor parallel implementation of X @ A@B, where A and B are sharded row-wise.
        # We use an all-reduce between X@A and (X@A)@B to ensure alignment across ranks.
        #
        # TODO(travis): this is not very efficient as we do an all-reduce for every adapter,
        #   instead we could pre-allocate a (B, a, r) tensor for all adapters with the same
        #   rank, compute `a_out` on each, and then slice them into the buffer as shown here:
        #   https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609
        torch.distributed.all_reduce(a_out, group=self.process_group)
        return a_out


================================================
FILE: backends/gaudi/server/text_generation_server/layers/medusa.py
================================================
import torch
from torch import nn
from typing import Tuple, Optional
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.layers.linear import FastLinear
from text_generation_server.layers.tensor_parallel import (
    TensorParallelHead,
    TensorParallelColumnLinear,
)


class ResBlock(torch.nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.linear = FastLinear.load(
            config, prefix=f"{prefix}.linear", weights=weights, bias=True
        )
        self.act = torch.nn.SiLU()

    def forward(self, x):
        return x + self.act(self.linear(x))


class MedusaModel(torch.nn.Module):
    def __init__(self, config, medusa_config, weights):
        super().__init__()
        self.heads = torch.nn.ModuleList(
            [
                MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights)
                for i in range(get_speculate())
            ]
        )

    def forward(self, x):
        if not self.heads:
            return None
        speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
        return speculative_logits


class MedusaHead(torch.nn.Module):
    def __init__(self, config, medusa_config, prefix, weights):
        super().__init__()
        self.blocks = torch.nn.ModuleList(
            [
                ResBlock(config, prefix=f"{prefix}.{i}", weights=weights)
                for i in range(medusa_config["medusa_num_layers"])
            ]
        )
        n = len(self.blocks)
        self.out = FastLinear.load(
            config, prefix=f"{prefix}.{n}", weights=weights, bias=False
        )

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        x = self.out(x)
        return x


class MedusaHeadV1(nn.Module):
    def __init__(self, lm_head, medusa):
        super().__init__()
        self.lm_head = lm_head
        self.medusa = medusa

    @staticmethod
    def load(config, prefix: str, weights):
        from pathlib import Path
        from safetensors import safe_open
        import json

        speculator = config.speculator

        path = speculator["path"]
        medusa_config = str(Path(path) / "config.json")

        for fname in speculator["model_paths"]:
            filename = str(Path(path) / fname)

            with open(medusa_config, "r") as f:
                medusa_config = json.load(f)
            routing = weights.routing
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing and routing[k] != filename:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename

        medusa = MedusaModel(config, medusa_config, weights)
        lm_head = TensorParallelHead.load(config, prefix, weights)
        return MedusaHeadV1(lm_head, medusa)

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(input)
        # If we have too many tokens, we skip speculative logits
        if input.shape[0] > 128:
            return logits, None

        speculative_logits = self.medusa(input)
        return logits, speculative_logits


class MedusaHeadV2(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        from pathlib import Path
        from safetensors import safe_open
        import json

        speculator_path = config.speculator["path"]

        medusa_config = str(Path(speculator_path) / "config.json")
        filename = str(Path(speculator_path) / "medusa_lm_head.safetensors")

        with open(medusa_config, "r") as f:
            medusa_config = json.load(f)
        routing = weights.routing
        with safe_open(filename, framework="pytorch") as f:
            for k in f.keys():
                if k in routing and routing[k] != filename:
                    raise RuntimeError(
                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                    )
                routing[k] = filename

        self.n_medusa_heads = get_speculate()

        assert medusa_config["medusa_num_layers"] == 1
        self.linear = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.process_group = weights.process_group
        self.world_size = self.process_group.size()
        self.rank = self.process_group.rank()

        self.act = torch.nn.SiLU()

        self.lm_head = TensorParallelHead.load(config, prefix, weights)

    def forward(self, x):
        # If we have too many tokens, we skip speculative logits
        if x.shape[0] > 128:
            logits = self.lm_head(x)
            return logits, None

        size = x.shape[-1]
        block_size = (size + self.world_size - 1) // self.world_size
        start = self.rank * block_size
        stop = (self.rank + 1) * block_size

        x_block = x[:, start:stop]

        # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1
        medusa_res = self.act(self.linear(x)).reshape(
            *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1]
        )

        # Apply all residual medusa heads
        output = x[:, start:stop].unsqueeze(-2) + medusa_res

        # Gather medusa heads
        world_output = [
            torch.empty_like(output) for _ in range(self.process_group.size())
        ]
        torch.distributed.all_gather(world_output, output, group=self.process_group)
        world_output = torch.cat(world_output, dim=-1)

        # Stack x and medusa residual x
        stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2)

        # Compute lm head on x + medusa residual x
        logits = self.lm_head(stacked_x)

        # Finally, split logits from speculative logits
        logits, speculative_logits = torch.split(
            logits, [1, self.n_medusa_heads], dim=-2
        )
        # Squeeze added dimension
        logits = logits.squeeze(-2)

        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/layers/mlp.py
================================================
import torch
import math
from torch import nn
from torch.nn import functional as F
from typing import Optional, Tuple
from text_generation_server.layers import TensorParallelEmbedding, FastLinear
from text_generation_server.layers.tensor_parallel import TensorParallelHead
from text_generation_server.utils.speculate import get_speculate


class MLPSpeculatorLayerNorm(nn.Module):
    """
    A L2 normalization implementation
    ...
    Args
    ----
    normalized_shape : int
        Dimensionality of input data (size of final tensor axis)
    elementwise_scale_weight : torch.Tensor
        learned scaling term after normalization?
    elementwise_shift_bias : torch.Tensor
        learned bias term after normalization?
    eps : float
        Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8).
    """

    def __init__(
        self,
        prefix,
        config,
        weights,
        eps=1e-06,
    ):
        super(MLPSpeculatorLayerNorm, self).__init__()
        self.weight = weights.get_tensor(f"{prefix}.weight")
        self.bias = weights.get_tensor(f"{prefix}.bias")
        self.eps = eps

    def forward(self, x):
        xf = x
        xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps)
        x = xf.type_as(x)
        x = self.weight * x
        x = x + self.bias
        return x


INV_SQRT2 = 2**-0.5


def simple_norm(x: torch.Tensor, eps=1e-06):
    xf = x
    xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + eps)
    x = xf.type_as(x)
    return x * INV_SQRT2


class MLPSpeculatorModelTied(torch.nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.config = config
        self.n_predict = get_speculate()
        self.hidden_size = config.hidden_size

        self.emb = TensorParallelEmbedding(f"{prefix}.emb.0", weights)
        self.proj0 = FastLinear.load(
            config,
            prefix=f"{prefix}.proj.0",
            weights=weights,
            bias=False,
        )
        self.proj1 = FastLinear.load(
            config,
            prefix=f"{prefix}.proj.1",
            weights=weights,
            bias=False,
        )
        self.head = FastLinear.load(config, f"{prefix}.head.0", weights, bias=False)
        self.ln = MLPSpeculatorLayerNorm(
            prefix=f"{prefix}.ln.0",
            config=config,
            weights=weights,
        )

        # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation
        self.state_weight = 0.5 ** (0.5 / self.n_predict) if self.n_predict > 0 else 1
        self.activation = nn.GELU()
        self.vsize = config.vocab_size
        self.inner_dim = config.speculator_config["inner_dim"]
        self.top_k_tokens_per_head = [1] * self.n_predict
        self.emb_weight = math.sqrt(1 - self.state_weight**2) * math.sqrt(
            self.inner_dim / 2
        )
        self.emb.weight *= self.emb_weight

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_ids: torch.Tensor,
    ):
        top_k_tokens_per_head = self.top_k_tokens_per_head

        # k indicates # of candidates
        # h indicates # of generated tokens
        state = hidden_states
        b = state.size(0)
        ind = input_ids.unsqueeze(0)
        all_probs = torch.empty(
            b, self.n_predict, self.vsize, device=state.device
        )  # b k h v
        assert (
            len(top_k_tokens_per_head) == self.n_predict
        ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)"
        for i in range(self.n_predict):
            # Project and predict
            z = self.emb(ind)
            # z = z.mul(self.emb_weight)  # b k d
            if i == 0:
                state = self.proj0(state) * self.state_weight + z
            else:
                state = self.proj1(state) * self.state_weight + z
            state = self.activation(self.ln(state))  # b k d
            probs = F.log_softmax(self.head(state), dim=-1)  # b k v
            _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1)  # b k k'

            # Update candidate set with new predictions

            # Update distribution set with new logits
            all_probs[:, i] = probs.exp()

            # Update state, log_probs and ind for new predictions
            state = state.unsqueeze(2).expand(
                -1, -1, top_k_tokens_per_head[i], -1
            )  # b k k' d
            state = state.reshape(-1, b, state.size(3))  # b kk' d
            ind = preds.view(-1, b)  # b kk'

        speculative_logits = all_probs
        return speculative_logits


class MLPSpeculatorModel(torch.nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.config = config
        self.n_predict = get_speculate()
        self.hidden_size = config.hidden_size

        self.emb = nn.ModuleList(
            [
                TensorParallelEmbedding(f"{prefix}.emb.{i}", weights)
                for i in range(self.n_predict)
            ]
        )
        self.proj = [
            FastLinear.load(
                config,
                prefix=f"{prefix}.proj.{i}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_predict)
        ]
        self.head = nn.ModuleList(
            [
                FastLinear.load(config, f"{prefix}.head.{i}", weights, bias=False)
                for i in range(self.n_predict)
            ]
        )
        self.ln = nn.ModuleList(
            [
                MLPSpeculatorLayerNorm(
                    prefix=f"{prefix}.ln.{i}",
                    config=config,
                    weights=weights,
                )
                for i in range(self.n_predict)
            ]
        )

        # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation
        self.state_weight = 0.5 ** (0.5 / self.n_predict) if self.n_predict > 0 else 1
        self.activation = nn.GELU()
        self.vsize = config.vocab_size
        self.inner_dim = config.speculator_config["inner_dim"]
        self.top_k_tokens_per_head = [1] * self.n_predict
        self.emb_weight = math.sqrt(1 - self.state_weight**2) * math.sqrt(
            self.inner_dim / 2
        )
        self.emb.weight *= self.emb_weight

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_ids: torch.Tensor,
    ):
        top_k_tokens_per_head = self.top_k_tokens_per_head

        # k indicates # of candidates
        # h indicates # of generated tokens
        state = hidden_states
        b = state.size(0)
        ind = input_ids.unsqueeze(0)
        all_probs = torch.empty(
            b, self.n_predict, self.vsize, device=state.device
        )  # b k h v
        assert (
            len(top_k_tokens_per_head) == self.n_predict
        ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)"
        for i in range(self.n_predict):
            # Project and predict
            z = self.emb[i](ind)
            # z = z.mul(self.emb_weight)  # b k d
            state = self.proj[i](state) * self.state_weight + z
            state = self.activation(self.ln[i](state))  # b k d
            probs = F.log_softmax(self.head[i](state), dim=-1)  # b k v
            _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1)  # b k k'

            # Update candidate set with new predictions

            # Update distribution set with new logits
            all_probs[:, i] = probs.exp()

            # Update state, log_probs and ind for new predictions
            state = state.unsqueeze(2).expand(
                -1, -1, top_k_tokens_per_head[i], -1
            )  # b k k' d
            state = state.reshape(-1, b, state.size(3))  # b kk' d
            ind = preds.view(-1, b)  # b kk'

        speculative_logits = all_probs
        return speculative_logits


class MLPSpeculatorHead(nn.Module):
    def __init__(self, lm_head, mlp_speculator, scale_input: bool):
        super().__init__()
        self.lm_head = lm_head
        self.mlp_speculator = mlp_speculator
        self.scale_input = scale_input

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(input)
        # If we have too many tokens, we skip speculative logits
        if input.shape[0] > 128:
            return logits, None

        input_ids = logits.argmax(dim=-1)
        if self.scale_input:
            input = simple_norm(input)
        speculative_logits = self.mlp_speculator(input, input_ids)
        return logits, speculative_logits

    @staticmethod
    def load(config, prefix: str, weights):
        from pathlib import Path
        from safetensors import safe_open

        speculator_path = config.speculator["path"]

        for fname in config.speculator["model_paths"]:
            filename = str(Path(speculator_path) / fname)
            routing = weights.routing
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing and routing[k] != filename:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename

        tie_weights = config.speculator_config.get("tie_weights", False)
        if tie_weights:
            mlp_speculator = MLPSpeculatorModelTied(config, "speculator", weights)
        else:
            mlp_speculator = MLPSpeculatorModel(config, "speculator", weights)
        # This is used in https://huggingface.co/ibm-fms/llama3-70b-accelerator
        scale_input = config.speculator_config.get("scale_input", False)
        lm_head = TensorParallelHead.load(config, prefix, weights)
        return MLPSpeculatorHead(lm_head, mlp_speculator, scale_input)


================================================
FILE: backends/gaudi/server/text_generation_server/layers/moe/__init__.py
================================================
from typing import Optional, Protocol, runtime_checkable

import torch
import torch.nn as nn
from loguru import logger
from transformers.activations import ACT2FN

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)
from text_generation_server.layers.fp8 import HybridFP8UnquantLoader
from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer
from text_generation_server.layers.moe.fp8 import FP8SparseMoELayer
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    Weights,
    UnquantizedWeight,
)

from .fused_moe import fused_topk, grouped_topk

# NOTE: we are using a protocol here, because multiple inherance is not nice.
#       We need `Module`, and `Module` -> some abstract class -> some concrete
#       class inheritance is whacky.


@runtime_checkable
class MoELayer(Protocol):
    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
        hidden_act: str = "silu",
        scoring_func: Optional[str] = None,
        e_score_correction_bias: Optional[float] = None,
    ): ...

    def forward(
        self, x: torch.Tensor, *, gating_output: torch.Tensor
    ) -> torch.Tensor: ...


class DenseMoELayer(nn.Module):
    """
    Layer for MoE that applies *all* experts to each tokens and then weights
    their outputs based on the calculated routing. This layer is much slower
    than `SparseMoELayer` and should only be used when no fused kernels are
    available (e.g. for unsupported quantizers).
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
        hidden_act: str = "silu",
        scoring_func: Optional[str] = None,
        e_score_correction_bias: Optional[float] = None,
    ):
        super().__init__()

        assert scoring_func is None, "scoring func is not handled"
        assert e_score_correction_bias is None, "scoring correction bias is not handled"

        log_once(
            logger.info,
            "No fused layers are available for this model type, using (slower) dense MoE layer",
        )

        assert (n_expert_group is None) == (
            topk_group is None
        ), "n_expert_group and topk_group must both be None or have some value"

        self.n_expert_group = n_expert_group
        self.n_experts = n_experts
        self.renormalize = renormalize
        self.topk = topk
        self.topk_group = topk_group

        if "gelu" in hidden_act:
            self.act = lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh"
                    if hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
                    else "none"
                ),
            )
        elif "silu" in hidden_act:
            self.act = torch.nn.functional.silu
        else:
            self.act = ACT2FN[hidden_act]

        self.gate_proj = [
            TensorParallelColumnLinear.load(
                None,
                prefix=f"{prefix}.{i}.{gate_proj_name}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_experts)
        ]
        self.up_proj = [
            TensorParallelColumnLinear.load(
                None,
                prefix=f"{prefix}.{i}.{up_proj_name}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_experts)
        ]
        self.down_proj = [
            TensorParallelRowLinear.load(
                None,
                prefix=f"{prefix}.{i}.{down_proj_name}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_experts)
        ]

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        """
        x: (sequence_length, model_dim)
        gating_output: (sequence_length, n_experts)
        """
        # optional reshape
        input_shape = x.shape
        x = x.view(-1, input_shape[-1])

        if self.n_expert_group is not None and self.topk_group is not None:
            topk_weights, topk_ids = grouped_topk(
                x,
                gating_output,
                self.topk,
                renormalize=self.renormalize,
                num_expert_group=self.n_expert_group,
                topk_group=self.topk_group,
            )
        else:
            topk_weights, topk_ids = fused_topk(
                x, gating_output, self.topk, self.renormalize
            )
            topk_weights = topk_weights.to(x.dtype)

        weights = torch.zeros(
            topk_ids.shape[0], self.n_experts, dtype=x.dtype, device=x.device
        )

        weights.scatter_(1, topk_ids.long(), topk_weights.to(weights.dtype))

        out = torch.zeros_like(x)
        for i in range(self.n_experts):
            h = self.act(self.gate_proj[i](x)) * self.up_proj[i](x)
            h = self.down_proj[i](h, reduce=False)
            out += h * weights[:, i].view(-1, 1)

        return out


class SparseMoELayer(nn.Module):
    """
    Layer for MoE that uses fused kernels to only apply the active experts
    for each token (rather than applying all experts and selecting the
    outputs of active experts).
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        scoring_func: Optional[str] = "softmax",
        e_score_correction_bias: Optional[float] = None,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        super().__init__()
        if (
            isinstance(weights.loader, DefaultWeightsLoader)
            and isinstance(weights.loader.weight_class, UnquantizedWeight)
        ) or isinstance(weights.loader, HybridFP8UnquantLoader):
            if (
                isinstance(weights.loader, HybridFP8UnquantLoader)
                and weights.loader.to_fp8
            ):
                cls = FP8SparseMoELayer
            else:
                cls = UnquantizedSparseMoELayer
        else:
            raise ValueError(
                f"Unsupported weights loader: {type(weights.loader)}, sparse MoE is only supported for unquantized, AWQ, and GPTQ weights"
            )

        log_once(
            logger.info,
            "Using MoE layer wih fused gemm",
        )

        self.moe = cls(
            n_expert_group=n_expert_group,
            n_experts=n_experts,
            prefix=prefix,
            renormalize=renormalize,
            topk=topk,
            topk_group=topk_group,
            weights=weights,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
            down_proj_name=down_proj_name,
        )

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        return self.moe(x, gating_output=gating_output)

    @staticmethod
    def is_supported(weights: Weights) -> bool:
        return (
            isinstance(weights.loader, DefaultWeightsLoader)
            and isinstance(weights.loader.weight_class, UnquantizedWeight)
        ) or isinstance(weights.loader, HybridFP8UnquantLoader)


================================================
FILE: backends/gaudi/server/text_generation_server/layers/moe/fp8.py
================================================
from typing import Optional

import torch
import torch.nn as nn
import os

from text_generation_server.utils.weights import Weights
from text_generation_server.layers.fp8 import (
    Fp8Weight,
    fp8_quantize,
    quant_dtype,
    normalize_e4m3fn_to_native_float8,
    dynamic_quant,
    dequant_block_fp8_weight_naive,
)
from text_generation_server.layers.moe.fused_moe import select_experts
import habana_frameworks.torch as htorch


class FP8SparseMoELayer(nn.Module):
    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        scoring_func: Optional[str] = "softmax",
        e_score_correction_bias: Optional[float] = None,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        super().__init__()

        assert (n_expert_group is None) == (
            topk_group is None
        ), "n_expert_group and topk_group must both be None or have some value"

        self.n_expert_group = n_expert_group
        self.topk = topk
        self.topk_group = topk_group
        self.renormalize = renormalize
        self.weight_block_size = weights.weights_loader.weight_block_size
        self.scoring_func = scoring_func
        self.e_score_correction_bias = e_score_correction_bias
        self.world_size = weights.process_group.size()
        self.rank = weights.process_group.rank()
        self.ep_rank = self.rank
        self.use_ep = os.getenv("USE_EXPERT_PARALLEL", "true").lower() == "true"
        if (n_experts + self.world_size - 1) // self.world_size < 4:
            self.use_ep = False
        if self.use_ep:
            n_experts_per_rank = (n_experts + self.world_size - 1) // self.world_size
            self.ep_offset = self.ep_rank * n_experts_per_rank
            n_experts = min(n_experts_per_rank, n_experts - self.ep_offset)
        else:
            self.ep_offset = 0

        (
            self.gate_up_proj,
            self.gate_up_proj_weight_scale,
            self.gate_up_proj_input_scale,
        ) = _load_expert_multi_weights_col(
            prefix=prefix,
            n_experts=n_experts,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
            weights=weights,
            use_ep=self.use_ep,
            ep_offset=self.ep_offset,
        )

        self.down_proj, self.down_proj_weight_scale, self.down_proj_input_scale = (
            _load_expert_weights_row(
                prefix=prefix,
                n_experts=n_experts,
                name=down_proj_name,
                weights=weights,
                use_ep=self.use_ep,
                ep_offset=self.ep_offset,
            )
        )
        if self.weight_block_size is not None:
            self.gate_up_proj, self.gate_up_proj_weight_scale = dynamic_quant(
                dequant_block_fp8_weight_naive(
                    self.gate_up_proj,
                    self.gate_up_proj_weight_scale,
                    self.weight_block_size,
                )
            )
            self.down_proj, self.down_proj_weight_scale = dynamic_quant(
                dequant_block_fp8_weight_naive(
                    self.down_proj, self.down_proj_weight_scale, self.weight_block_size
                )
            )
            self.gate_up_proj_weight_scale, self.down_proj_weight_scale = (
                self.gate_up_proj_weight_scale.squeeze(-1),
                self.down_proj_weight_scale.squeeze(-1),
            )

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        topk_weights, topk_ids = select_experts(
            hidden_states=x,
            router_logits=gating_output,
            use_grouped_topk=self.n_expert_group is not None,
            top_k=self.topk,
            renormalize=self.renormalize,
            topk_group=self.topk_group,
            num_expert_group=self.n_expert_group,
            scoring_func=self.scoring_func,
            e_score_correction_bias=self.e_score_correction_bias,
        )
        total_num_experts = gating_output.size(-1)
        x_fp8, x_scale = dynamic_quant(x, single_scale=True)

        if self.use_ep:
            moe_n_slice = 1
            n_expert_slice = (
                total_num_experts + self.world_size - 1
            ) // self.world_size
        else:
            moe_n_slice = 1
            n_expert_slice = (total_num_experts + moe_n_slice - 1) // moe_n_slice
        for i in range(moe_n_slice):
            min_expert = i * n_expert_slice
            max_expert = min((i + 1) * n_expert_slice, total_num_experts)
            w13_list_slice = [
                self.gate_up_proj[j, ...] for j in range(min_expert, max_expert)
            ]
            w2_list_slice = [
                self.down_proj[j, ...] for j in range(min_expert, max_expert)
            ]
            w13_weight_scale = [
                self.gate_up_proj_weight_scale[j, ...]
                for j in range(min_expert, max_expert)
            ]
            w2_weight_scale = [
                self.down_proj_weight_scale[j, ...]
                for j in range(min_expert, max_expert)
            ]

            current_hidden_states = torch.ops.hpu.mixture_of_experts(
                hidden_states=x_fp8,
                expert_routing_table=topk_ids.to(torch.int64),
                router_weights=topk_weights.to(x.dtype),
                w12=w13_list_slice,
                w3=w2_list_slice,
                d_scale_hidden_states=x_scale,
                d_scale_w12=w13_weight_scale,
                d_scale_w3=w2_weight_scale,
                permuted_weights=True,
                activation="silu",
                experts_min=min_expert + self.ep_offset,
                experts_max=max_expert + self.ep_offset - 1,
            )
            htorch.core.mark_step()
            if i == 0:
                final_hidden_states = current_hidden_states
            else:
                final_hidden_states.add_(current_hidden_states)
        return final_hidden_states


def _load_expert_weights(
    get_weight_fn,
    *,
    prefix: str,
    n_experts: int,
    name: str,
    weights: Weights,
    ep_offset: int = 0,
) -> torch.Tensor:
    all_weight = None
    all_weight_scales = None
    max_input_scale = None

    for i in range(n_experts):
        weight = get_weight_fn(prefix, i + ep_offset, name, weights)

        assert isinstance(weight, Fp8Weight)

        if all_weight is None:
            all_weight = torch.empty(
                (n_experts,) + weight.weight.shape,
                dtype=quant_dtype,
                device=weight.weight.device,
            )
        if all_weight_scales is None:
            all_weight_scales = torch.empty(
                (n_experts,) + weight.weight_scale.shape,
                dtype=torch.float32,
                device=weight.weight.device,
            )

        if weight.weight.dtype in {torch.float8_e4m3fn, torch.float8_e4m3fnuz}:
            all_weight[i], all_weight_scales[i], current_input_scale = (
                normalize_e4m3fn_to_native_float8(
                    weight.weight, weight.weight_scale, weight.input_scale
                )
            )
            if current_input_scale is not None:
                if max_input_scale is None or current_input_scale > max_input_scale:
                    max_input_scale = current_input_scale
        else:
            all_weight[i], all_weight_scales[i] = fp8_quantize(
                weight.weight, scalar=True
            )

    assert all_weight is not None

    return all_weight, all_weight_scales, max_input_scale


def _load_expert_multi_weights_col(
    *,
    prefix: str,
    n_experts: int,
    gate_proj_name: str,
    up_proj_name: str,
    weights: Weights,
    use_ep: bool = False,
    ep_offset: int = 0,
) -> torch.Tensor:
    def get_weight_fn_sharded(prefix, i, name, weights):
        return weights.get_multi_weights_col(
            [f"{prefix}.{i}.{gate_proj_name}", f"{prefix}.{i}.{up_proj_name}"], 0
        )

    def get_weight_fn(prefix, i, name, weights):
        return weights.get_multi_weights(
            [f"{prefix}.{i}.{gate_proj_name}", f"{prefix}.{i}.{up_proj_name}"], 0
        )

    return _load_expert_weights(
        get_weight_fn if use_ep else get_weight_fn_sharded,
        prefix=prefix,
        n_experts=n_experts,
        name=None,
        weights=weights,
        ep_offset=ep_offset if use_ep else 0,
    )


def _load_expert_weights_row(
    *,
    prefix: str,
    n_experts: int,
    name: str,
    weights: Weights,
    use_ep: bool = False,
    ep_offset: int = 0,
) -> torch.Tensor:
    def get_weight_fn_sharded(prefix, i, name, weights):
        return weights.get_weights_row(f"{prefix}.{i}.{name}")

    def get_weight_fn(prefix, i, name, weights):
        return weights.get_weights(f"{prefix}.{i}.{name}")

    return _load_expert_weights(
        get_weight_fn if use_ep else get_weight_fn_sharded,
        prefix=prefix,
        n_experts=n_experts,
        name=name,
        weights=weights,
        ep_offset=ep_offset if use_ep else 0,
    )


================================================
FILE: backends/gaudi/server/text_generation_server/layers/moe/fused_moe.py
================================================
# coding=utf-8
# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple, Optional

import torch


def grouped_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
    num_expert_group: int = 0,
    topk_group: int = 0,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"

    gating_output = gating_output.float()
    if e_score_correction_bias is not None:
        e_score_correction_bias = e_score_correction_bias.float()

    if scoring_func == "softmax":
        scores = torch.softmax(gating_output, dim=-1)
    elif scoring_func == "sigmoid":
        scores = gating_output.sigmoid()
    else:
        raise ValueError(f"Unsupported scoring function: {scoring_func}")

    num_token = scores.shape[0]
    if e_score_correction_bias is not None:
        # Store original scores before applying correction bias. We use biased
        # scores for expert selection but original scores for routing weights
        original_scores = scores
        scores = scores + e_score_correction_bias.unsqueeze(0)
        group_scores = (
            scores.view(num_token, num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
        )
    else:
        group_scores = (
            scores.view(num_token, num_expert_group, -1).max(dim=-1).values
        )  # [n, n_group]

    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
        1
    ]  # [n, top_k_group]
    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
    score_mask = (
        group_mask.unsqueeze(-1)
        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
        .reshape(num_token, -1)
    )  # [n, e]
    tmp_scores = scores.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]

    if e_score_correction_bias is not None:
        topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1]
        # Use original unbiased scores for the routing weights
        topk_weights = original_scores.gather(1, topk_ids)
    else:
        topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)

    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

    return topk_weights.to(torch.float32), topk_ids.to(torch.int32)


def fused_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
) -> Tuple[torch.Tensor, torch.Tensor]:
    topk_weights = torch.nn.functional.softmax(
        gating_output, dim=1, dtype=torch.float32
    )
    topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
    if renormalize:
        topk_weights /= topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids


def select_experts(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    top_k: int,
    use_grouped_topk: bool,
    renormalize: bool,
    topk_group: Optional[int] = None,
    num_expert_group: Optional[int] = None,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[torch.Tensor] = None,
):

    # DeekSeekv2 uses grouped_top_k
    if use_grouped_topk:
        assert topk_group is not None
        assert num_expert_group is not None
        topk_weights, topk_ids = grouped_topk(
            hidden_states=hidden_states,
            gating_output=router_logits,
            topk=top_k,
            renormalize=renormalize,
            num_expert_group=num_expert_group,
            topk_group=topk_group,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias,
        )
    else:
        topk_weights, topk_ids = fused_topk(
            hidden_states=hidden_states,
            gating_output=router_logits,
            topk=top_k,
            renormalize=renormalize,
        )
    return topk_weights, topk_ids


================================================
FILE: backends/gaudi/server/text_generation_server/layers/moe/unquantized.py
================================================
from typing import Optional

import torch
import torch.nn as nn

from text_generation_server.utils.weights import UnquantizedWeight, Weights
from vllm_hpu_extension.ops import VllmMixtureOfExpertsOp
import habana_frameworks.torch as htorch
import torch.nn.functional as F
import os


class UnquantizedSparseMoELayer(nn.Module):
    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        scoring_func: Optional[str] = "softmax",
        e_score_correction_bias: Optional[float] = None,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        super().__init__()

        assert (n_expert_group is None) == (
            topk_group is None
        ), "n_expert_group and topk_group must both be None or have some value"

        self.n_expert_group = n_expert_group
        self.topk = topk
        self.topk_group = topk_group
        self.renormalize = renormalize
        self.weight_block_size = weights.weights_loader.weight_block_size
        self.scoring_func = scoring_func
        self.e_score_correction_bias = e_score_correction_bias
        self.rank = weights.process_group.rank()
        self.world_size = weights.process_group.size()
        self.use_ep = os.getenv("USE_EXPERT_PARALLEL", "true").lower() == "true"
        if (n_experts + self.world_size - 1) // self.world_size < 4:
            self.use_ep = False
        if self.use_ep:
            n_experts_per_rank = (n_experts + self.world_size - 1) // self.world_size
            self.ep_offset = self.rank * n_experts_per_rank
            n_experts = min(n_experts_per_rank, n_experts - self.ep_offset)
            experts_min = self.ep_offset
            experts_max = self.ep_offset + n_experts - 1
        else:
            self.ep_offset = 0
            experts_min = 0
            experts_max = n_experts - 1

        self.gate_up_proj = _load_expert_multi_weights_col(
            prefix=prefix,
            n_experts=n_experts,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
            weights=weights,
            use_ep=self.use_ep,
            ep_offset=self.ep_offset,
        )

        self.down_proj = _load_expert_weights_row(
            prefix=prefix,
            n_experts=n_experts,
            name=down_proj_name,
            weights=weights,
            use_ep=self.use_ep,
            ep_offset=self.ep_offset,
        )

        self.MoeOp = VllmMixtureOfExpertsOp(n_experts, experts_min, experts_max)
        for i in range(n_experts):
            self.MoeOp.w13_list[i].set_weight(self.gate_up_proj[i])
            self.MoeOp.w2_list[i].set_weight(self.down_proj[i])

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        htorch.core.mark_step()
        routing_weights = F.softmax(gating_output, dim=1, dtype=torch.float32)
        routing_weights, selected_experts = torch.topk(
            routing_weights, self.topk, dim=-1
        )
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        routing_weights = routing_weights.to(x.dtype)

        final_hidden_states = self.MoeOp(
            hidden_states=x,
            expert_routing_table=selected_experts,
            router_weights=routing_weights,
            permuted_weights=True,
            activation="silu",
        )

        return final_hidden_states.view(-1, x.shape[1])


def _load_expert_multi_weights_col(
    *,
    prefix: str,
    n_experts: int,
    gate_proj_name: str,
    up_proj_name: str,
    weights: Weights,
    use_ep: bool = False,
    ep_offset: int = 0,
) -> torch.Tensor:
    all_weight = None
    for i in range(n_experts):
        if not use_ep:
            weight = weights.get_multi_weights_col(
                [f"{prefix}.{i}.{gate_proj_name}", f"{prefix}.{i}.{up_proj_name}"], 0
            )
        else:
            weight = weights.get_multi_weights(
                [
                    f"{prefix}.{i+ep_offset}.{gate_proj_name}",
                    f"{prefix}.{i+ep_offset}.{up_proj_name}",
                ],
                0,
            )

        assert isinstance(weight, UnquantizedWeight)

        if all_weight is None:
            all_weight = torch.empty(
                (n_experts,) + weight.weight.shape,
                dtype=weight.weight.dtype,
                device=weight.weight.device,
            )

        all_weight[i] = weight.weight

    assert all_weight is not None

    return all_weight


def _load_expert_weights_row(
    *,
    prefix: str,
    n_experts: int,
    name: str,
    weights: Weights,
    use_ep: bool = False,
    ep_offset: int = 0,
) -> torch.Tensor:
    all_weight = None
    for i in range(n_experts):
        if not use_ep:
            weight = weights.get_weights_row(
                f"{prefix}.{i}.{name}",
            )
        else:
            weight = weights.get_weights(
                f"{prefix}.{i+ep_offset}.{name}",
            )

        assert isinstance(weight, UnquantizedWeight)

        if all_weight is None:
            all_weight = torch.empty(
                (n_experts,) + weight.weight.shape,
                dtype=weight.weight.dtype,
                device=weight.weight.device,
            )

        all_weight[i] = weight.weight

    assert all_weight is not None

    return all_weight


================================================
FILE: backends/gaudi/server/text_generation_server/layers/rotary.py
================================================
import os
import math
import torch
from torch import nn
from habana_frameworks.torch.hpex.kernels import (
    RotaryPosEmbeddingMode,
    apply_rotary_pos_emb,
)


def _create_inv_freq(dim, base, device):
    inv_freq = 1.0 / (
        base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
    )
    return inv_freq


def _get_rope_config(config):
    if os.getenv("ROPE_SCALING", None) is not None:
        rope_scaling = {
            "type": os.environ["ROPE_SCALING"],
            "factor": float(os.environ["ROPE_FACTOR"]),
        }
        return rope_scaling
    return getattr(config, "rope_scaling", None)


class PositionRotaryEmbedding(nn.Module):
    def __init__(self, inv_freq, scaling_factor, max_position_embeddings):
        super().__init__()
        self.inv_freq = inv_freq
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.scaling_factor = scaling_factor
        self.dynamic_args = None
        self._update_cos_sin_cache(
            torch.float32, inv_freq.device, max_position_embeddings
        )

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        num_tokens = query.shape[0]
        head_size = query.shape[-1]
        # HPU RoPE kernel requires hidden dimension for cos and sin to be equal
        # to query hidden dimension, so the original tensors need to be
        # expanded
        # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE
        # and expansion of cos/sin tensors via concatenation
        rope_mode = RotaryPosEmbeddingMode.BLOCKWISE
        cos = torch.cat((cos, cos), dim=-1)
        sin = torch.cat((sin, sin), dim=-1)
        rotary_dim = cos.shape[-1]
        query_shape = query.shape
        query = query.view(num_tokens, -1, head_size)
        query_rot = query[..., :rotary_dim]
        query_pass = query[..., rotary_dim:]
        query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, rope_mode)
        query.copy_(torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape))

        key_shape = key.shape
        key = key.view(num_tokens, -1, head_size)
        key_rot = key[..., :rotary_dim]
        key_pass = key[..., rotary_dim:]
        key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode)
        key.copy_(torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape))

    @classmethod
    def static(cls, config, dim, base, device):
        inv_freq = _create_inv_freq(dim, base, device)
        scaling_factor = None
        rope_scaling = _get_rope_config(config)
        if not hasattr(config, "max_position_embeddings") and hasattr(
            config, "max_seq_len"
        ):
            # handling for dbrx
            config.max_position_embeddings = config.max_seq_len
        if rope_scaling is not None:
            # `rope_type` is now standard in transformers, but some existing models
            # have `type` instead.
            rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))

            if rope_type == "linear":
                pass
            elif rope_type == "default":
                pass
            elif rope_type == "mrope":
                mrope_section = rope_scaling["mrope_section"]
                if mrope_section is not None:
                    return RotaryPositionEmbeddingMultimodalSections(
                        inv_freq,
                        scaling_factor,
                        mrope_section,
                        config.max_position_embeddings,
                    )
            elif rope_type == "dynamic":
                scaling_factor = rope_scaling["factor"]
                return DynamicPositionRotaryEmbedding(
                    dim=dim,
                    max_position_embeddings=config.max_position_embeddings,
                    base=base,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                )
            elif rope_type == "llama3":
                inv_freq = apply_llama3_scaling(
                    inv_freq,
                    scaling_factor=rope_scaling["factor"],
                    low_freq_factor=rope_scaling["low_freq_factor"],
                    high_freq_factor=rope_scaling["high_freq_factor"],
                    original_max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                )

                return cls(inv_freq, scaling_factor, config.max_position_embeddings)

            elif rope_type == "yarn":
                scaling_factor = rope_scaling["factor"]
                mscale = rope_scaling.get("mscale", 1.0)
                mscale_all_dim = rope_scaling.get("mscale_all_dim", 0.0)
                return YarnPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                    base=base,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                    extrapolation_factor=1,
                    attn_factor=1,
                    beta_fast=32,
                    beta_slow=1,
                    mscale=mscale,
                    mscale_all_dim=mscale_all_dim,
                )
            elif rope_type in ["su", "longrope"]:
                short_factor = torch.tensor(
                    rope_scaling["short_factor"], dtype=torch.float32, device=device
                )
                short_inv_freq = 1.0 / (
                    short_factor
                    * base
                    ** (
                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
                        / dim
                    )
                )
                long_factor = torch.tensor(
                    rope_scaling["long_factor"], dtype=torch.float32, device=device
                )
                long_inv_freq = 1.0 / (
                    long_factor
                    * base
                    ** (
                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
                        / dim
                    )
                )

                original_max_position_embeddings = (
                    config.original_max_position_embeddings
                )
                max_position_embeddings = config.max_position_embeddings
                if max_position_embeddings <= original_max_position_embeddings:
                    scaling_factor = 1.0
                else:
                    scale = max_position_embeddings / original_max_position_embeddings
                    scaling_factor = math.sqrt(
                        1 + math.log(scale) / math.log(original_max_position_embeddings)
                    )

                # if short_mscale and long_mscale are provided we need to scale the freqs
                # using the Phi3LongRoPEScaledRotaryEmbedding
                if ("short_mscale" in rope_scaling) and ("long_mscale" in rope_scaling):
                    short_mscale = rope_scaling["short_mscale"]
                    long_mscale = rope_scaling["long_mscale"]
                    return Phi3LongRoPEScaledRotaryEmbedding(
                        short_inv_freq=short_inv_freq,
                        long_inv_freq=long_inv_freq,
                        max_position_embeddings=config.max_position_embeddings,
                        short_mscale=short_mscale,
                        long_mscale=long_mscale,
                        original_max_position_embeddings=original_max_position_embeddings,
                    )

                return SuRotaryEmbedding(
                    short_inv_freq=short_inv_freq,
                    long_inv_freq=long_inv_freq,
                    scaling_factor=scaling_factor,
                    original_max_position_embeddings=original_max_position_embeddings,
                    max_position_embeddings=config.max_position_embeddings,
                )
            else:
                raise NotImplementedError(
                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                )
        return cls(inv_freq, scaling_factor, config.max_position_embeddings)

    @classmethod
    def load(cls, config, prefix, weights):
        # XXX: Always load this in float32 !
        dtype = weights.dtype
        weights.dtype = torch.float32
        inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
        weights.dtype = dtype

        scaling_factor = None
        rope_scaling = _get_rope_config(config)
        if rope_scaling is not None:
            scaling_factor = rope_scaling["factor"]
            if rope_scaling["type"] == "linear":
                pass
            elif rope_scaling["type"] == "dynamic":
                return DynamicPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=config.max_position_embeddings,
                    base=10000.0,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                )
            elif rope_scaling["type"] == "yarn":
                mscale = rope_scaling.get("mscale", 1.0)
                mscale_all_dim = rope_scaling.get("mscale_all_dim", 0.0)
                return YarnPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                    base=10000.0,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                    extrapolation_factor=1,
                    attn_factor=1,
                    beta_fast=32,
                    beta_slow=1,
                    mscale=mscale,
                    mscale_all_dim=mscale_all_dim,
                )
            else:
                raise NotImplementedError(
                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                )
        return cls(inv_freq, scaling_factor, config.max_position_embeddings)

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            if self.scaling_factor is not None:
                t /= self.scaling_factor
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)

    def get_cos_sin(self, position_ids: torch.Tensor):

        cos = torch.index_select(self._cos_cached, 0, position_ids)
        sin = torch.index_select(self._sin_cached, 0, position_ids)

        # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
        return cos.unsqueeze(1), sin.unsqueeze(1)


class SuRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(
        self,
        short_inv_freq,
        long_inv_freq,
        scaling_factor,
        original_max_position_embeddings,
        max_position_embeddings,
    ):
        super(PositionRotaryEmbedding, self).__init__()
        self.short_inv_freq = short_inv_freq
        self.long_inv_freq = long_inv_freq
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.dynamic_args = None
        self._update_cos_sin_cache(
            torch.float32, short_inv_freq.device, max_position_embeddings
        )

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached is None
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen

            t = torch.arange(seqlen, device=device, dtype=self.short_inv_freq.dtype)
            short_freqs = torch.outer(
                t[: self.original_max_position_embeddings],
                self.short_inv_freq.to(device=t.device),
            )
            long_freqs = torch.outer(
                t[self.original_max_position_embeddings :],
                self.long_inv_freq.to(device=t.device),
            )

            freqs = torch.cat([short_freqs, long_freqs])

            self._cos_cached = (torch.cos(freqs) * self.scaling_factor).to(dtype)
            self._sin_cached = (torch.sin(freqs) * self.scaling_factor).to(dtype)


class Phi3LongRoPEScaledRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(
        self,
        short_inv_freq: torch.Tensor,
        long_inv_freq: torch.Tensor,
        max_position_embeddings: int,
        short_mscale: float,
        long_mscale: float,
        original_max_position_embeddings: int,
    ):
        super(PositionRotaryEmbedding, self).__init__()
        self.short_inv_freq = short_inv_freq
        self.long_inv_freq = long_inv_freq
        self.max_position_embeddings = max_position_embeddings
        self.short_mscale = short_mscale
        self.long_mscale = long_mscale
        self.original_max_position_embeddings = original_max_position_embeddings

        # cache
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.dynamic_args = None
        self._update_cos_sin_cache(
            torch.float32, short_inv_freq.device, max_position_embeddings
        )

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached is None
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.short_inv_freq.dtype)

            short_freqs = torch.outer(
                t[: self.original_max_position_embeddings],
                self.short_inv_freq.to(device=t.device),
            )

            long_freqs = torch.outer(
                t[self.original_max_position_embeddings :],
                self.long_inv_freq.to(device=t.device),
            )

            short_freqs = short_freqs * self.short_mscale
            long_freqs = long_freqs * self.long_mscale

            freqs = torch.empty((seqlen, short_freqs.shape[1]), device=device)
            freqs[: self.original_max_position_embeddings] = short_freqs
            freqs[self.original_max_position_embeddings :] = long_freqs

            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)


class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
        inv_freq = _create_inv_freq(dim, base, device)
        super().__init__(inv_freq, scaling_factor, max_position_embeddings)
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            if seqlen > self.max_position_embeddings:
                newbase = self.base * (
                    (self.scaling_factor * seqlen / self.max_position_embeddings)
                    - (self.scaling_factor - 1)
                ) ** (self.dim / (self.dim - 2))
                self.inv_freq = _create_inv_freq(
                    self.dim, newbase, self.inv_freq.device
                )
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)


def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
    high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func


def get_mscale(scale: float = 1.0, mscale: float = 1.0):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(
        self,
        dim,
        max_position_embeddings,
        base,
        device,
        scaling_factor,
        *,
        extrapolation_factor,
        attn_factor,
        beta_fast,
        beta_slow,
        mscale: float,
        mscale_all_dim: float,
    ):
        inv_freq = _create_inv_freq(dim, base, device)
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.extrapolation_factor = extrapolation_factor
        self.attn_factor = attn_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale_all_dim = mscale_all_dim
        self.scaling_factor = scaling_factor
        self.mscale = float(
            get_mscale(self.scaling_factor, mscale)
            / get_mscale(self.scaling_factor, mscale_all_dim)
            * self.attn_factor
        )  # Get n-d magnitude scaling corrected for interpolation
        super().__init__(inv_freq, scaling_factor, max_position_embeddings)

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            if seqlen > self.max_position_embeddings or True:
                inv_freq_extrapolation = _create_inv_freq(
                    self.dim, self.base, self.inv_freq.device
                )
                freqs = 1.0 / inv_freq_extrapolation
                inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
                low, high = find_correction_range(
                    self.beta_fast,
                    self.beta_slow,
                    self.dim,
                    self.base,
                    self.max_position_embeddings,
                )

                inv_freq_mask = (
                    1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device)
                ) * self.extrapolation_factor  # Get n-d rotational scaling corrected for extrapolation
                inv_freq = (
                    inv_freq_interpolation * (1 - inv_freq_mask)
                    + inv_freq_extrapolation * inv_freq_mask
                )

                self.inv_freq = inv_freq

            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype)
            self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype)


def apply_llama3_scaling(
    freqs: torch.Tensor,
    *,
    scaling_factor: int,
    low_freq_factor: int,
    high_freq_factor: int,
    original_max_position_embeddings: int,
):
    low_freq_wavelen = original_max_position_embeddings / low_freq_factor
    high_freq_wavelen = original_max_position_embeddings / high_freq_factor
    new_freqs = []

    for freq in freqs:
        wavelen = 2 * math.pi / freq

        if wavelen < high_freq_wavelen:
            new_freqs.append(freq)
        elif wavelen > low_freq_wavelen:
            new_freqs.append(freq / scaling_factor)
        else:
            assert low_freq_wavelen != high_freq_wavelen
            smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
                high_freq_factor - low_freq_factor
            )
            new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq)

    return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)


class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding):
    def __init__(
        self,
        inv_freq: torch.Tensor,
        scaling_factor: float,
        sections: list,
        max_position_embeddings,
    ):
        self.sections = sections
        self._cos_cached = None
        self._sin_cached = None
        self.section_indices = (
            torch.arange(len(self.sections))
            .repeat_interleave(torch.tensor(self.sections))
            .view(1, 1, -1)
            .to(inv_freq.device)
        )
        super().__init__(inv_freq, scaling_factor, max_position_embeddings)

    def _update_cos_sin_cache(
        self, dtype: torch.dtype, device: torch.device, seqlen: int
    ):
        # always cache the cos/sin for the full sequence length to avoid
        # recomputing if the sequence length is smaller than the cached one
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
            self._sections = self.section_indices.expand(seqlen, -1, -1)

    def get_cos_sin(
        self,
        position_ids: torch.Tensor,
    ):
        slen = position_ids.shape[0]

        cos = self._cos_cached[position_ids].gather(1, self._sections[:slen])
        sin = self._sin_cached[position_ids].gather(1, self._sections[:slen])
        return cos, sin


================================================
FILE: backends/gaudi/server/text_generation_server/layers/speculative.py
================================================
import torch
import json
from typing import Tuple, Optional
from text_generation_server.layers.tensor_parallel import TensorParallelHead
from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2
from text_generation_server.layers.mlp import MLPSpeculatorHead


class SpeculativeHead(torch.nn.Module):
    def __init__(self, lm_head, speculator):
        super().__init__()
        self.head = lm_head
        self.speculator = speculator

    @staticmethod
    def load(config, prefix: str, weights):
        speculator = config.speculator
        if speculator:
            speculator_path = config.speculator["path"]
            speculator_config = str(speculator_path / "config.json")

            with open(speculator_config, "r") as f:
                speculator_config = json.load(f)

            config.speculator_config = speculator_config
            try:
                architecture = speculator_config["architectures"][0]

                if architecture == "MLPSpeculatorPreTrainedModel":
                    speculator = MLPSpeculatorHead.load(config, prefix, weights)
                else:
                    speculator = None
            except KeyError:
                try:
                    speculator = MedusaHeadV1.load(config, prefix, weights)
                except Exception:
                    speculator = MedusaHeadV2(config, prefix, weights)
            lm_head = None
        else:
            lm_head = TensorParallelHead.load(config, prefix, weights)
            speculator = None
        return SpeculativeHead(lm_head, speculator)

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        if self.speculator is not None:
            return self.speculator(input)

        assert self.head is not None
        logits = self.head(input)
        return logits, None


================================================
FILE: backends/gaudi/server/text_generation_server/layers/tensor_parallel.py
================================================
import torch
from torch.nn import functional as F
from typing import Iterable, List
from text_generation_server.layers.linear import get_linear, FastLinear

import habana_frameworks.torch as htorch


class LayerConcat(torch.nn.Module):
    """
    Apply multiple layers to the input and concatenate their
    outputs.
    """

    def __init__(self, layers: Iterable[torch.nn.Module], dim: int = -1):
        """
        `dim` is the dimension along which layer outputs are concatenated.
        """
        super().__init__()
        self.layers = layers
        self.dim = dim

    def forward(self, x: torch.Tensor):
        outputs = [layer(x) for layer in self.layers]
        return torch.cat(outputs, self.dim)


class SuperLayer(torch.nn.Module):
    def __init__(self, linear):
        super().__init__()
        self.linear = linear

    def forward(self, x):
        return self.linear.forward(x)


class TensorParallelHead(SuperLayer):
    def __init__(self, linear, process_group, should_gather: bool):
        super().__init__(linear)
        self.process_group = process_group
        self.should_gather = should_gather

    @staticmethod
    def load(config, prefix: str, weights):
        if config.quantize == "exl2":
            try:
                # If the piece and LM head embeddings are shared, we have
                # non-quantized weights...
                weight = weights.get_tensor(f"{prefix}.weight")
            except Exception:
                # ...otherwise they are quantized.
                weight = weights.get_weights_col(prefix)
            should_gather = weights.process_group.size() > 1
        elif weights.process_group.size() > 1:
            try:
                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
                should_gather = True
            except AssertionError:
                # If the vocab size is not divisible by number of shards
                # just load the entire thing.
                weight = weights.get_tensor(f"{prefix}.weight")
                should_gather = False
        else:
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False

        return TensorParallelHead(
            get_linear(weight, bias=None),
            process_group=weights.process_group,
            should_gather=should_gather,
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if not self.should_gather:
            return super().forward(input)

        world_size = self.process_group.size()
        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
            out_dim = self.linear.weight.shape[0]

            if input.shape[0] == 1:
                world_out = input.new_empty(1, out_dim * world_size)
                local_out = input.new_empty(1, out_dim)
                gather_input = local_out
            else:
                world_out = input.new_empty(out_dim * world_size, input.shape[0])
                gather_input = input.new_empty(out_dim, input.shape[0])
                local_out = gather_input.T

            torch.mm(input, self.linear.weight.T, out=local_out)
            htorch.core.mark_step()
            torch.distributed.all_gather_into_tensor(
                world_out, gather_input, group=self.process_group
            )

            if input.shape[0] == 1:
                return world_out
            return world_out.T

        output = super().forward(input)
        world_output = [
            torch.empty_like(output) for _ in range(self.process_group.size())
        ]

        htorch.core.mark_step()
        torch.distributed.all_gather(world_output, output, group=self.process_group)
        world_output = torch.cat(world_output, dim=-1)
        return world_output


class TensorParallelColumnLinear(SuperLayer):
    @classmethod
    def load_gate_up(cls, config, prefix: str, weights, bias: bool):
        """Specific method when the QKV was joined after the fact"""
        weight = weights.get_weights_col_packed_gate_up(prefix)
        if bias:
            raise NotImplementedError("packed_gate_up only implemented without bias")
        else:
            bias = None
        linear = get_linear(weight, bias)
        return cls(linear)

    @classmethod
    def load_qkv(
        cls,
        config,
        prefix: str,
        weights,
        bias: bool,
        num_heads: int,
        num_key_value_heads: int,
    ):
        """Specific method when the QKV was joined after the fact"""
        weight = weights.get_weights_col_packed_qkv(
            prefix,
            num_heads=num_heads,
            num_key_value_heads=num_key_value_heads,
        )
        if bias:
            raise NotImplementedError("packed_qkv only implemented for baichuan")
        else:
            bias = None
        linear = get_linear(weight, bias)
        return cls(linear)

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_weights_col(prefix)
        if bias:
            bias = weights.get_sharded(f"{prefix}.bias", dim=0)
        else:
            bias = None
        linear = get_linear(weight, bias)
        return cls(linear)

    @classmethod
    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
        if config.quantize == "exl2":
            linears = []
            for prefix in prefixes:
                weight = weights.get_weights_col(prefix)
                b = weights.get_tensor(f"{prefix}.bias") if bias else None
                linears.append(get_linear(weight, b))
            linear = LayerConcat(linears)
        else:
            weight = weights.get_multi_weights_col(prefixes, dim=dim)
            if bias:
                b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
                bias = torch.cat(b, dim=dim)
            else:
                bias = None
            linear = get_linear(weight, bias)
        return cls(linear)


class TensorParallelRowLinear(SuperLayer):
    def __init__(self, linear, process_group):
        super().__init__(linear)
        self.process_group = process_group

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_weights_row(prefix)

        if bias and weights.process_group.rank() == 0:
            # Rank is only on the first rank process
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(
            get_linear(weight, bias),
            process_group=weights.process_group,
        )

    def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
        out = super().forward(input)
        if self.process_group.size() > 1 and reduce:
            # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
            # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
            # (which is required for tensor parallel HPUGraph inference)
            htorch.core.mark_step()
            torch.distributed.all_reduce(out, group=self.process_group)
        return out


class TensorParallelEmbedding(torch.nn.Module):
    def __init__(self, prefix: str, weights, reduce=True):
        super().__init__()
        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]

        process_group = weights.process_group

        world_size = process_group.size()
        rank = process_group.rank()

        block_size = (num_embeddings + world_size - 1) // world_size
        self.min_id = rank * block_size
        self.max_id = min(num_embeddings, (rank + 1) * block_size)
        self.null_idx = weight.shape[
            0
        ]  # Usually block_size, might be less in non even vocab_size.
        self.process_group = weights.process_group
        self.reduce = reduce

        """Additional 0 entry used for masking"""
        self.weight = torch.nn.Parameter(F.pad(weight, (0, 0, 0, 1)))

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        input = torch.where(
            (self.min_id > input) | (input >= self.max_id),
            self.null_idx,
            input - self.min_id,
        )
        out = torch.nn.functional.embedding(input, self.weight)
        if self.reduce and self.process_group.size() > 1:
            # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
            # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
            # (which is required for tensor parallel HPUGraph inference)
            htorch.core.mark_step()
            torch.distributed.all_reduce(out, group=self.process_group)
        return out


================================================
FILE: backends/gaudi/server/text_generation_server/models/__init__.py
================================================
# ruff: noqa: F821
# the above line disables the `undefined-name` rule for the model type variables
import torch
import os

from loguru import logger
from transformers.configuration_utils import PretrainedConfig
from huggingface_hub import hf_hub_download, HfApi
from typing import Optional
from pathlib import Path
from typing import List, Dict
import enum

# Needed to properly setup habana_frameworks

from text_generation_server.utils.speculate import get_speculate, set_speculate
from text_generation_server.models.model import Model
from text_generation_server.models.custom_modeling.flash_phi_moe_modeling import (
    PhiMoEConfig,
)

from text_generation_server.utils.adapter import (
    AdapterParameters,
    build_layer_weight_lookup,
    load_and_merge_adapters,
    AdapterInfo,
)
from text_generation_server.adapters.lora import LoraWeights

from text_generation_server.utils.log import log_master

__all__ = [
    "Model",
    "CausalLM",
    "Seq2SeqLM",
    "get_model_with_lora_adapters",
]

VLM_BATCH_TYPES = set()

FLASH_ATTENTION = True

try:
    from text_generation_server.models.flash_causal_lm import FlashCausalLM
    from text_generation_server.models.flash_vlm_causal_lm import FlashVlmCausalLM
    from text_generation_server.models.mllama_causal_lm import FlashMllamaCausalLM
    from text_generation_server.models.custom_modeling.flash_deepseek_v2_modeling import (
        FlashDeepseekV2ForCausalLM,
        DeepseekV2Config,
    )
    from text_generation_server.models.custom_modeling.flash_deepseek_v3_modeling import (
        FlashDeepseekV3ForCausalLM,
        DeepseekV3Config,
    )
    from text_generation_server.models.custom_modeling.flash_llama_modeling import (
        FlashLlamaForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_llama4_modeling import (
        Llama4ForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.flash_cohere_modeling import (
        FlashCohereForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gemma_modeling import (
        FlashGemmaForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gemma2_modeling import (
        FlashGemma2ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gemma3_modeling import (
        Gemma3ForConditionalGeneration,
        FlashGemma3ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_dbrx_modeling import (
        FlashDbrxForCausalLM,
        DbrxConfig,
    )
    from text_generation_server.models.custom_modeling.flash_rw_modeling import (
        RWConfig,
        FlashRWForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_neox_modeling import (
        FlashGPTNeoXForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_pali_gemma_modeling import (
        PaliGemmaForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.flash_phi_modeling import (
        FlashPhiForCausalLM,
    )
    from text_generation_server.models.mllama_causal_lm import FlashMllamaCausalLMBatch
    from text_generation_server.models.custom_modeling.flash_mllama import (
        FlashMllamaForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.flash_llava_next import (
        FlashLlavaNextForConditionalGeneration,
    )

    from text_generation_server.models.custom_modeling.flash_santacoder_modeling import (
        FlashSantacoderForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_starcoder2_modeling import (
        FlashStarcoder2ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
        Qwen2ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_qwen3_modeling import (
        Qwen3ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_qwen3_moe_modeling import (
        Qwen3MoeForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
        FlashMistralForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_mixtral_modeling import (
        FlashMixtralForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gpt2_modeling import (
        FlashGPT2ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gptj_modeling import (
        FlashGPTJForCausalLM,
    )
    from text_generation_server.models.custom_modeling.idefics2 import (
        Idefics2ForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.idefics3 import (
        Idefics3ForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.qwen2_vl import (
        Qwen2VLForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.qwen2_5_vl import (
        Qwen2_5VLForConditionalGeneration,
        Qwen2_5_VLConfig,
        Qwen2_5_VLProcessor,
    )
    from text_generation_server.layers.attention import SUPPORTS_WINDOWING
except ImportError as e:
    log_master(logger.warning, f"Could not import Flash Attention enabled models: {e}")
    SUPPORTS_WINDOWING = False
    FLASH_ATTENTION = False
    VLM_BATCH_TYPES = set()

if FLASH_ATTENTION:
    __all__.append(FlashCausalLM)

    from text_generation_server.models.flash_vlm_causal_lm import (
        FlashVlmCausalLMBatch,
    )

    VLM_BATCH_TYPES = {
        FlashVlmCausalLMBatch,
        FlashMllamaCausalLMBatch,
    }


__all__.append(VLM_BATCH_TYPES)


class ModelType(enum.Enum):
    DEEPSEEK_V2 = {
        "type": "deepseek_v2",
        "name": "Deepseek V2",
        "url": "https://huggingface.co/deepseek-ai/DeepSeek-V2",
    }
    DEEPSEEK_V3 = {
        "type": "deepseek_v3",
        "name": "Deepseek V3",
        "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3",
    }
    IDEFICS2 = {
        "type": "idefics2",
        "name": "Idefics 2",
        "url": "https://huggingface.co/HuggingFaceM4/idefics2-8b",
        "multimodal": True,
    }
    IDEFICS3 = {
        "type": "idefics3",
        "name": "Idefics 3",
        "url": "https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3",
        "multimodal": True,
    }
    LLAVA_NEXT = {
        "type": "llava_next",
        "name": "Llava Next (1.6)",
        "url": "https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf",
        "multimodal": True,
    }
    LLAMA = {
        "type": "llama",
        "name": "Llama",
        "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f",
    }
    LLAMA4 = {
        "type": "llama4",
        "name": "Llama4",
        "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f",
    }
    PHI3 = {
        "type": "phi3",
        "name": "Phi 3",
        "url": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct",
    }
    GRANITE = {
        "type": "granite",
        "name": "Granite",
        "url": "https://huggingface.co/ibm-granite/granite-3.0-8b-instruct",
    }
    GEMMA = {
        "type": "gemma",
        "name": "Gemma",
        "url": "https://huggingface.co/google/gemma-7b",
    }
    PALIGEMMA = {
        "type": "paligemma",
        "name": "PaliGemma",
        "url": "https://huggingface.co/google/paligemma-3b-pt-224",
    }
    GEMMA2 = {
        "type": "gemma2",
        "name": "Gemma2",
        "url": "https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315",
    }
    GEMMA3 = {
        "type": "gemma3",
        "name": "Gemma3",
        "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d",
    }
    GEMMA3_TEXT = {
        "type": "gemma3_text",
        "name": "Gemma3 Text",
        "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d",
    }
    COHERE = {
        "type": "cohere",
        "name": "Cohere",
        "url": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    }
    DBRX = {
        "type": "dbrx",
        "name": "Dbrx",
        "url": "https://huggingface.co/databricks/dbrx-instruct",
    }
    MAMBA = {
        "type": "mamba",
        "name": "Mamba",
        "url": "https://huggingface.co/state-spaces/mamba-2.8b-slimpj",
    }
    MISTRAL = {
        "type": "mistral",
        "name": "Mistral",
        "url": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    }
    MIXTRAL = {
        "type": "mixtral",
        "name": "Mixtral",
        "url": "https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1",
    }
    GPT_BIGCODE = {
        "type": "gpt_bigcode",
        "name": "Gpt Bigcode",
        "url": "https://huggingface.co/bigcode/gpt_bigcode-santacoder",
    }
    PHI = {
        "type": "phi",
        "name": "Phi",
        "url": "https://huggingface.co/microsoft/phi-1_5",
    }
    PHI_MOE = {
        "type": "phimoe",
        "name": "PhiMoe",
        "url": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    }
    BAICHUAN = {
        "type": "baichuan",
        "name": "Baichuan",
        "url": "https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat",
    }
    FALCON = {
        "type": "falcon",
        "name": "Falcon",
        "url": "https://huggingface.co/tiiuae/falcon-7b-instruct",
    }
    STARCODER2 = {
        "type": "starcoder2",
        "name": "StarCoder 2",
        "url": "https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1",
    }
    QWEN2 = {
        "type": "qwen2",
        "name": "Qwen 2",
        "url": "https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f",
    }
    QWEN2_VL = {
        "type": "qwen2_vl",
        "name": "Qwen 2 VL",
        "url": "https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d",
    }
    QWEN2_5_VL = {
        "type": "qwen2_5_vl",
        "name": "Qwen 2.5 VL",
        "url": "https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e",
    }
    QWEN3 = {
        "type": "qwen3",
        "name": "Qwen 3",
        "url": "https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f",
    }
    QWEN3_MOE = {
        "type": "qwen3_moe",
        "name": "Qwen 3 Moe",
        "url": "https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f",
    }
    GALACTICA = {
        "type": "galactica",
        "name": "Galactica",
        "url": "https://huggingface.co/facebook/galactica-120b",
    }
    SANTACODER = {
        "type": "santacoder",
        "name": "SantaCoder",
        "url": "https://huggingface.co/bigcode/santacoder",
    }
    GPT2 = {
        "type": "gpt2",
        "name": "Gpt2",
        "url": "https://huggingface.co/openai-community/gpt2",
    }
    GPT_NEOX = {
        "type": "gpt_neox",
        "name": "Gpt Neox",
        "url": "https://huggingface.co/EleutherAI/gpt-neox-20b",
    }
    GPTJ = {
        "type": "gptj",
        "name": "Gptj",
        "url": "https://huggingface.co/EleutherAI/gpt-j-6b",
    }
    MLLAMA = {
        "type": "mllama",
        "name": "Mllama",
        "url": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
        "multimodal": True,
    }


__GLOBALS = locals()
for data in ModelType:
    __GLOBALS[data.name] = data.value["type"]

SDP_ON_BF16 = int(os.environ.get("SDP_ON_BF16", 0))
# Disable gradients
torch.set_grad_enabled(False)


def get_model(
    model_id: str,
    lora_adapter_ids: Optional[List[str]],
    revision: Optional[str],
    sharded: bool,
    quantize: Optional[str],
    speculate: Optional[int],
    dtype: Optional[torch.dtype],
    kv_cache_dtype: Optional[str],
    trust_remote_code: bool,
    max_input_tokens: int,
) -> Model:
    global FLASH_ATTENTION

    if speculate is not None:
        set_speculate(speculate)
    else:
        set_speculate(0)

    config_dict, _ = PretrainedConfig.get_config_dict(
        model_id, revision=revision, trust_remote_code=trust_remote_code
    )
    model_type = config_dict.get("model_type", None)

    speculator = None
    if "medusa_num_heads" in config_dict:
        medusa_model_id = model_id
        medusa_revision = revision
        model_id = config_dict["base_model_name_or_path"]
        revision = "main"
        speculate_medusa = config_dict["medusa_num_heads"]
        if speculate is not None:
            if speculate > speculate_medusa:
                raise RuntimeError(
                    f"Speculate is set to `{speculate}` but this medusa models only has `{speculate_medusa}` heads, please make them match"
                )
            else:
                set_speculate(speculate)
        else:
            set_speculate(speculate_medusa)

        config_dict, _ = PretrainedConfig.get_config_dict(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        # Reload model type from parent.
        model_type = config_dict.get("model_type", None)
        is_local = Path(medusa_model_id).exists()
        if not is_local:
            medusa_config = hf_hub_download(
                medusa_model_id, revision=medusa_revision, filename="config.json"
            )
            hf_hub_download(
                medusa_model_id,
                revision=medusa_revision,
                filename="medusa_lm_head.safetensors",
            )
            speculator = {
                "path": Path(medusa_config).parent,
                "model_paths": ["medusa_lm_head.safetensors"],
            }
        else:
            speculator = {
                "path": Path(medusa_model_id),
                "model_paths": ["medusa_lm_head.safetensors"],
            }

        method = "medusa"
    elif model_type == "mlp_speculator":
        mlp_model_id = model_id
        mlp_revision = revision
        model_id = config_dict["base_model_name_or_path"]
        revision = "main"
        speculate_mlp = config_dict["n_predict"]
        if speculate is not None:
            if speculate > speculate_mlp:
                raise RuntimeError(
                    f"Speculate is set to `{speculate}` but this mlp_speculator models only has `{speculate_mlp}` heads, please make them match"
                )
            else:
                set_speculate(speculate)
        else:
            set_speculate(speculate_mlp)

        config_dict, _ = PretrainedConfig.get_config_dict(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        # Reload model type from parent.
        model_type = config_dict.get("model_type", None)
        is_local = Path(mlp_model_id).exists()
        extension = ".safetensors"
        if not is_local:
            mlp_speculator_config = hf_hub_download(
                mlp_model_id, revision=mlp_revision, filename="config.json"
            )
            api = HfApi()
            info = api.model_info(mlp_model_id, revision=mlp_revision)
            filenames = [
                s.rfilename
                for s in info.siblings
                if s.rfilename.endswith(extension)
                and len(s.rfilename.split("/")) == 1
                and "arguments" not in s.rfilename
                and "args" not in s.rfilename
                and "training" not in s.rfilename
            ]
            for filename in filenames:
                hf_hub_download(
                    mlp_model_id,
                    revision=mlp_revision,
                    filename=filename,
                )
            speculator_dir_path = Path(mlp_speculator_config).parent
            # if these are downloaded, they get converted to safetensors
            filenames.extend(
                [p for p in os.listdir(speculator_dir_path) if p.endswith(extension)]
            )
            speculator = {
                "path": Path(mlp_speculator_config).parent,
                "model_paths": filenames,
            }
        else:
            speculator = Path(mlp_model_id)
            filenames = [p for p in os.listdir(speculator) if p.endswith(extension)]
            speculator = {"path": speculator, "model_paths": filenames}
        method = "mlp_speculator"
    else:
        method = "n-gram"

    speculate = get_speculate()
    if speculate > 0:
        logger.info(f"Using speculation {method} with {speculate} input ids.")

    model_type = config_dict["model_type"]

    if kv_cache_dtype == "fp8_e4m3fn":
        kv_cache_dtype = torch.float8_e4m3fn
    elif kv_cache_dtype == "fp8_e5m2":
        kv_cache_dtype = torch.float8_e5m2
    else:
        kv_cache_dtype = dtype

    if FLASH_ATTENTION:
        if model_type == DEEPSEEK_V2:
            head_size = max(
                config_dict.get("qk_nope_dim", 128)
                + config_dict.get("qk_rope_dim", 64),
                config_dict.get("v_head_dim", 128),
            )
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashDeepseekV2ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                default_dtype=torch.bfloat16,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=DeepseekV2Config,
                head_size=head_size,
            )
        elif model_type == DEEPSEEK_V3:
            head_size = max(
                config_dict.get("qk_nope_dim", 128)
                + config_dict.get("qk_rope_dim", 64),
                config_dict.get("v_head_dim", 128),
            )
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashDeepseekV3ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                default_dtype=torch.bfloat16,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=DeepseekV3Config,
                head_size=head_size,
            )

        elif (
            model_type == GPT_BIGCODE
            or model_type == GPT2
            and model_id.startswith("bigcode/")
        ):
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashSantacoderForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                aliases={"transformer.wte.weight": ["lm_head.weight"]},
                num_kv_heads=1,
            )
        elif model_type == GPT2:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGPT2ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == GPTJ:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGPTJForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == GPT_NEOX:
            from text_generation_server.models.custom_modeling.flash_neox_modeling import (
                GPTNeoXConfig,
            )

            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGPTNeoXForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=GPTNeoXConfig,
            )
        elif model_type == PHI:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashPhiForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == PHI_MOE:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashLlamaForCausalLM,
                config_class=PhiMoEConfig,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == LLAMA or model_type == PHI3 or model_type == GRANITE:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashLlamaForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == LLAMA4:
            print(f"Llama4 model detected: {model_id}")
            return FlashVlmCausalLM(
                model_id=model_id,
                model_class=Llama4ForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                support_chunking=False,
            )
        elif model_type == BAICHUAN:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashLlamaForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == GEMMA:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGemmaForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # Works better for these models
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == GEMMA2:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGemma2ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # Works better for these models
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == GEMMA3:
            return FlashVlmCausalLM(
                model_id=model_id,
                model_class=Gemma3ForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                support_chunking=False,
            )
        elif model_type == GEMMA3_TEXT:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGemma3ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # Works better for these models
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == COHERE:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashCohereForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == DBRX:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashDbrxForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # Dbrx works better in bfloat16.
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=DbrxConfig,
            )
        elif (
            model_type in ["RefinedWeb", "RefinedWebModel", FALCON]
            and not sharded
            and not config_dict.get("alibi", False)
        ):
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashRWForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                aliases={
                    "lm_head.weight": ["transformer.word_embeddings.weight"],
                    "transformer.word_embeddings.weight": ["lm_head.weight"],
                },
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=RWConfig,
            )
        elif model_type == MISTRAL:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashMistralForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == MIXTRAL:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashMixtralForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == STARCODER2:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashStarcoder2ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == QWEN2:
            return FlashCausalLM(
                model_id=model_id,
                model_class=Qwen2ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == QWEN2_VL:
            return FlashVlmCausalLM(
                model_id=model_id,
                model_class=Qwen2VLForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                default_dtype=torch.bfloat16,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                # TODO: Fix bug in rust image_text_replacement implementation
                support_chunking=False,
            )
        elif model_type == QWEN2_5_VL:
            return FlashVlmCausalLM(
                model_id=model_id,
                model_class=Qwen2_5VLForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                default_dtype=torch.bfloat16,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=Qwen2_5_VLConfig,
                processor_class=Qwen2_5_VLProcessor,
                # TODO: Fix bug in rust image_text_replacement implementation
                support_chunking=False,
            )
        elif model_type == QWEN3:
            return FlashCausalLM(
                model_id=model_id,
                model_class=Qwen3ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == QWEN3_MOE:
            return FlashCausalLM(
                model_id=model_id,
                model_class=Qwen3MoeForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == MLLAMA:
            return FlashMllamaCausalLM(
                model_id=model_id,
                model_class=FlashMllamaForConditionalGeneration,
                batch_class=FlashMllamaCausalLMBatch,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                support_chunking=False,
            )
        elif model_type == IDEFICS2:
            return FlashVlmCausalLM(
                model_id=model_id,
                model_class=Idefics2ForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                # XXX: Extremely important to cap resolution in order to limit
                # VRAM usage.
                processor_kwargs={"size": {"longest_edge": 448, "shortest_edge": 378}},
            )
        elif model_type == IDEFICS3:
            return FlashVlmCausalLM(
                model_id=model_id,
                model_class=Idefics3ForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                # XXX: Extremely important to cap resolution in order to limit
                # VRAM usage.
                processor_kwargs={"size": {"longest_edge": 1456}},
            )
        elif model_type == PALIGEMMA:
            return FlashVlmCausalLM(
                model_id=model_id,
                model_class=PaliGemmaForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # Works better for these models
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif model_type == LLAVA_NEXT:
            return FlashVlmCausalLM(
                model_class=FlashLlavaNextForConditionalGeneration,
                model_id=model_id,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
            )

    raise ValueError(f"Unsupported model type {model_type}")


# get_model_with_lora_adapters wraps the internal get_model function and adds support for loading adapters
# this provides a post model loading hook to load adapters into the model after the model has been loaded
def get_model_with_lora_adapters(
    model_id: str,
    lora_adapters: Optional[List[AdapterInfo]],
    revision: Optional[str],
    sharded: bool,
    quantize: Optional[str],
    speculate: Optional[int],
    dtype: Optional[torch.dtype],
    kv_cache_dtype: Optional[str],
    trust_remote_code: bool,
    max_input_tokens: int,
    adapter_to_index: Dict[str, int],
):
    lora_adapter_ids = [adapter.id for adapter in lora_adapters]
    model = get_model(
        model_id,
        lora_adapter_ids,
        revision,
        sharded,
        quantize,
        speculate,
        dtype,
        kv_cache_dtype,
        trust_remote_code,
        max_input_tokens,
    )

    if len(lora_adapters) > 0:
        target_to_layer = build_layer_weight_lookup(model.model)

        for index, adapter in enumerate(lora_adapters):
            # The AdapterParameters object allows for merging multiple adapters into a single adapter.
            # At the moment, we only support loading a single adapter into the model, but we keep the
            # AdapterParameters object for easier extension in the future.
            adapter_parameters = AdapterParameters(
                adapter_info=[adapter],
                # when merging multiple adapters we can weight them differently
                # if this is not set, all adapters will be weighted equally
                # see: text_generation_server.utils.merges.strategies for impl
                weights=None,
                merge_strategy=0,
                density=1.0,
                majority_sign_method=0,
            )

            adapter_index = index + 1
            adapter_to_index[adapter.id] = adapter_index

            logger.info(
                f"Loading adapter weights into model: {','.join([adapter.id for adapter in adapter_parameters.adapter_info])}"
            )
            weight_names = tuple([v[0] for v in target_to_layer.values()])
            (
                module_map,
                adapter_config,
                adapter_weight_names,
                adapter_tokenizer,
            ) = load_and_merge_adapters(
                model.model_id,
                adapter_parameters,
                adapter_index,
                weight_names,
                False,
            )

            unused_weight_names = adapter_weight_names.copy()

            adapter_layers = [
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
                "qkv_proj",
            ]

            for layer_name in adapter_layers:
                nlayers = (
                    1 if layer_name == "lm_head" else len(model.model.model.layers)
                )
                adapter_weights = LoraWeights.prepare_weights(
                    config=adapter_config,
                    module_map=module_map,
                    layer_type=layer_name,
                    unused_weight_names=unused_weight_names,
                    nlayers=nlayers,
                    dtype=model.dtype,
                    world_size=model.world_size,
                    process_group=model.process_group,
                    target_to_layer=target_to_layer,
                )

                if adapter_weights is None:
                    continue

                model.layer_to_adapter_weights[layer_name].add_adapter(
                    adapter_index, adapter_weights
                )

            if len(unused_weight_names) > 0:
                logger.warning(
                    f"{','.join([a.id for a in lora_adapters])} unused adapter weights: {unused_weight_names}"
                )

            if adapter_tokenizer is not None:
                model.tokenizers.add_tokenizer(adapter_index, adapter_tokenizer)

            model.loaded_adapters.add(adapter_index)

    return model


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/__init__.py
================================================


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py
================================================
# coding=utf-8
# Copyright 2022 HuggingFace Inc. team and BigScience workshop.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BLOOM model."""

import math
import os
import warnings
from typing import Optional, Tuple, Union

import torch
import torch.distributed
import torch.utils.checkpoint
from torch import nn
from torch.nn import LayerNorm
from torch.nn import functional as F

from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
)
from transformers import BloomConfig, PreTrainedModel

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    SpeculativeHead,
)

CUSTOM_KERNELS_ENABLED = False
if (
    torch.cuda.is_available()
    and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
):
    try:
        from custom_kernels import fused_bloom_attention_cuda

        CUSTOM_KERNELS_ENABLED = True
    except ImportError:
        pass

_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m"
_CONFIG_FOR_DOC = "BloomConfig"

BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "bigscience/bigscience-small-testing",
    "bigscience/bloom-560m",
    "bigscience/bloom-1b1",
    "bigscience/bloom-1b7",
    "bigscience/bloom-3b",
    "bigscience/bloom-7b1",
    "bigscience/bloom",
]


def _make_causal_mask(
    input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int
) -> torch.BoolTensor:
    """
    Make causal mask used for self-attention.
    """
    batch_size, target_length = input_ids_shape
    mask = torch.ones(
        (target_length, target_length + past_key_values_length),
        dtype=torch.bool,
        device=device,
    )
    mask = mask.triu(1 + past_key_values_length)

    expanded_mask = mask.unsqueeze(0).expand(
        batch_size, target_length, target_length + past_key_values_length
    )
    return expanded_mask


def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
    """
    Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.
    """
    batch_size, src_length = mask.shape
    tgt_length = tgt_length if tgt_length is not None else src_length

    expanded_mask = ~(mask[:, None, :].to(torch.bool))
    return expanded_mask.expand(batch_size, tgt_length, src_length)


def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int) -> torch.Tensor:
    """
    Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
    `softmax(l+a) = softmax(l)`. Based on
    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
    TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

    Args:
    Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
        attention_mask (`torch.Tensor`):
            Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
        num_heads (`int`, *required*):
            number of heads
        dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
            dtype of the output tensor
    """
    batch_size, seq_length = attention_mask.shape
    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
    base = torch.tensor(
        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
        device=attention_mask.device,
        dtype=torch.float32,
    )
    powers = torch.arange(
        1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32
    )
    slopes = torch.pow(base, powers)

    if closest_power_of_2 != num_heads:
        extra_base = torch.tensor(
            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
            device=attention_mask.device,
            dtype=torch.float32,
        )
        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
        extra_powers = torch.arange(
            1,
            1 + 2 * num_remaining_heads,
            2,
            device=attention_mask.device,
            dtype=torch.int32,
        )
        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)

    # Note: alibi will added to the attention bias that will be applied to the query, key product of attention
    # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length)
    # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length)
    # => the query_length dimension will then be broadcasted correctly
    # This is more or less identical to T5's relative position bias:
    # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527
    arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
    alibi = slopes[..., None] * arange_tensor
    return alibi


# @torch.jit.script
def dropout_add(
    x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool
) -> torch.Tensor:
    """
    Dropout add function

    Args:
        x (`torch.tensor`, *required*):
            input tensor
        residual (`torch.tensor`, *required*):
            esidual tensor
        prob (`float`, *required*):
            dropout probability
        training (`bool`, *required*):
            training mode
    """
    out = F.dropout(x, p=prob, training=training)
    out = residual + out
    return out


# @torch.jit.script # this is shit for unknow reasons.
def _split_heads(
    fused_qkv: torch.Tensor, num_heads: int, head_dim: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
    storage as `fused_qkv`

    Args:
        fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]

    Returns:
        query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
        value: [batch_size, seq_length, num_heads, head_dim]
    """
    batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
    fused_qkv = fused_qkv.view(batch_size, seq_length, num_heads, 3 * head_dim)
    query_layer, key_layer, value_layer = fused_qkv.split(head_dim, dim=-1)

    query_layer = query_layer.transpose(1, 2).reshape(
        batch_size * num_heads, seq_length, head_dim
    )
    key_layer = key_layer.permute(0, 2, 3, 1).reshape(
        batch_size * num_heads, head_dim, seq_length
    )
    value_layer = value_layer.transpose(1, 2).reshape(
        batch_size * num_heads, seq_length, head_dim
    )

    return query_layer, key_layer, value_layer


# @torch.jit.script
def _merge_heads(x: torch.Tensor, num_heads: int, head_dim: int) -> torch.Tensor:
    """
    Merge heads together over the last dimenstion

    Args:
        x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]

    Returns:
        torch.tensor: [batch_size, seq_length, num_heads * head_dim]
    """
    # What we want to achieve is:
    # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim
    batch_size_and_num_heads, seq_length, _ = x.shape
    batch_size = batch_size_and_num_heads // num_heads

    # First view to decompose the batch size
    # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim
    x = x.view(batch_size, num_heads, seq_length, head_dim)

    # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim
    x = x.permute(0, 2, 1, 3)

    # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim
    return x.reshape(batch_size, seq_length, num_heads * head_dim)


class BloomAttention(nn.Module):
    def __init__(self, prefix, config: BloomConfig, weights):
        super().__init__()

        self.pretraining_tp = config.pretraining_tp
        self.slow_but_exact = config.slow_but_exact

        self.process_group = weights.process_group

        self.hidden_size = config.hidden_size
        self.num_heads = config.n_head
        self.head_dim = self.hidden_size // self.num_heads
        self.split_size = self.hidden_size
        self.hidden_dropout = config.hidden_dropout

        if self.head_dim * self.num_heads != self.hidden_size:
            raise ValueError(
                f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:"
                f" {self.num_heads})."
            )

        # Layer-wise attention scaling
        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
        self.beta = 1.0

        process_group = weights.process_group
        if self.num_heads % process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {process_group.size()}"
            )
        self.num_heads = self.num_heads // process_group.size()
        self.query_key_value = TensorParallelColumnLinear.load(
            config=config,
            prefix=f"{prefix}.query_key_value",
            weights=weights,
            bias=True,
        )
        self.dense = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.dense", weights=weights, bias=True
        )
        self.attention_dropout = nn.Dropout(config.attention_dropout)

    @staticmethod
    def compute_attention(
        fused_qkv: torch.Tensor,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]],
        alibi: torch.Tensor,
        attention_mask: torch.Tensor,
        head_mask: Optional[torch.Tensor],
        beta: float,
        inv_norm_factor: float,
        num_heads: int,
        use_cache: bool,
    ):
        batch_size, q_length, three_times_hidden_size = fused_qkv.shape
        head_dim = three_times_hidden_size // (3 * num_heads)
        batch_size * num_heads

        ### TODO @thomasw21: this takes quite a bit of time, how do I accelerate that?
        # 3 x [batch_size, seq_length, num_heads, head_dim]
        (query_layer, key_layer, value_layer) = _split_heads(
            fused_qkv, num_heads=num_heads, head_dim=head_dim
        )

        if layer_past is not None:
            past_key, past_value = layer_past
            # concatenate along seq_length dimension:
            #  - key: [batch_size * self.num_heads, head_dim, kv_length]
            #  - value: [batch_size * self.num_heads, kv_length, head_dim]
            past_key = past_key.view(-1, *past_key.shape[-2:])
            key_layer = torch.cat((past_key, key_layer), dim=2)
            past_value = past_value.view(-1, *past_value.shape[-2:])
            value_layer = torch.cat((past_value, value_layer), dim=1)

        _, _, kv_length = key_layer.shape

        if use_cache is True:
            present = (key_layer, value_layer)
        else:
            present = None
        ###

        # [batch_size * num_heads, q_length, kv_length]
        # we use `torch.Tensor.baddbmm` instead of `torch.baddbmm` as the latter isn't supported by TorchScript v1.11
        attention_scores = alibi.baddbmm(
            batch1=query_layer,
            batch2=key_layer,
            beta=beta,
            alpha=inv_norm_factor,
        )

        # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
        input_dtype = attention_scores.dtype
        # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
        if input_dtype == torch.float16:
            attention_scores = attention_scores.to(torch.float)
        # torch.finfo not supported by torch.jit, we temporarily remplace with `-1e34`
        attn_weights = attention_scores.masked_fill_(
            attention_mask, torch.finfo(attention_scores.dtype).min
        )
        attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
            input_dtype
        )

        # # [batch_size, num_heads, q_length, kv_length]
        # attention_probs = self.attention_dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # matmul: [batch_size * num_heads, q_length, head_dim]
        context_layer = torch.bmm(attention_probs, value_layer, out=query_layer)

        # change view [batch_size, num_heads, q_length, head_dim]
        context_layer = _merge_heads(
            context_layer, num_heads=num_heads, head_dim=head_dim
        )

        return context_layer, present, attention_probs

    def forward(
        self,
        hidden_states: torch.Tensor,
        residual: torch.Tensor,
        alibi: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
    ):
        fused_qkv = self.query_key_value(
            hidden_states
        )  # [batch_size, seq_length, 3 x hidden_size]
        batch_size, q_length, _ = fused_qkv.shape

        if layer_past is not None:
            past_key, past_value = layer_past
            layer_past = (
                past_key.view(-1, *past_key.shape[-2:]),
                past_value.view(-1, *past_value.shape[-2:]),
            )

        if CUSTOM_KERNELS_ENABLED and attention_mask.shape[-1] < 4096:
            assert self.training is False, "Only foward pass was implemented"
            assert (
                attention_mask.shape[-1] < 4096
            ), "Custom kernel support only up to 4096 tokens"
            (
                context_layer,
                present,
                attention_probs,
            ) = fused_bloom_attention_cuda.forward(
                fused_qkv,
                layer_past,
                alibi,
                attention_mask,
                head_mask,
                self.beta,
                self.inv_norm_factor,
                self.num_heads,
                use_cache,
            )
        else:
            context_layer, present, attention_probs = self.compute_attention(
                fused_qkv=fused_qkv,
                layer_past=layer_past,
                alibi=alibi,
                attention_mask=attention_mask,
                head_mask=head_mask,
                beta=self.beta,
                inv_norm_factor=self.inv_norm_factor,
                num_heads=self.num_heads,
                use_cache=use_cache,
            )

        # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232
        if self.pretraining_tp > 1 and self.slow_but_exact:
            slices = self.hidden_size / self.pretraining_tp
            output_tensor = torch.zeros_like(context_layer)
            for i in range(self.pretraining_tp):
                output_tensor = output_tensor + F.linear(
                    context_layer[:, :, int(i * slices) : int((i + 1) * slices)],
                    self.dense.weight[:, int(i * slices) : int((i + 1) * slices)],
                )
        else:
            output_tensor = self.dense(context_layer)

        # output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training)
        output_tensor += residual

        outputs = (output_tensor, present)
        if output_attentions:
            outputs += (attention_probs,)

        return outputs


class BloomMLP(nn.Module):
    def __init__(self, prefix, config: BloomConfig, weights):
        super().__init__()

        self.pretraining_tp = config.pretraining_tp
        self.slow_but_exact = config.slow_but_exact
        self.dense_h_to_4h = TensorParallelColumnLinear.load(
            config=config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=True
        )
        self.dense_4h_to_h = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=True
        )
        self.gelu_impl = torch.nn.GELU(approximate="tanh")
        self.hidden_dropout = config.hidden_dropout

    def forward(
        self, hidden_states: torch.Tensor, residual: torch.Tensor
    ) -> torch.Tensor:
        hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states))

        if self.pretraining_tp > 1 and self.slow_but_exact:
            intermediate_output = torch.zeros_like(residual)
            slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp
            for i in range(self.pretraining_tp):
                intermediate_output = intermediate_output + F.linear(
                    hidden_states[:, :, int(i * slices) : int((i + 1) * slices)],
                    self.dense_4h_to_h.weight[
                        :, int(i * slices) : int((i + 1) * slices)
                    ],
                )
        else:
            intermediate_output = self.dense_4h_to_h(hidden_states)

        # output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training)
        intermediate_output += residual

        return intermediate_output


class BloomBlock(nn.Module):
    def __init__(self, layer_id: int, config: BloomConfig, weights):
        super().__init__()

        prefix = f"h.{layer_id}"
        self.input_layernorm = LayerNorm.load(
            prefix=f"{prefix}.input_layernorm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.num_heads = config.n_head
        self.self_attention = BloomAttention(
            prefix=f"{prefix}.self_attention", config=config, weights=weights
        )
        self.post_attention_layernorm = LayerNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.mlp = BloomMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.apply_residual_connection_post_layernorm = (
            config.apply_residual_connection_post_layernorm
        )
        self.hidden_dropout = config.hidden_dropout

    def forward(
        self,
        hidden_states: torch.Tensor,
        alibi: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
    ):
        # hidden_states: [batch_size, seq_length, hidden_size]

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)

        # Layer norm post the self attention.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        # Self attention.
        attn_outputs = self.self_attention(
            layernorm_output,
            residual,
            layer_past=layer_past,
            attention_mask=attention_mask,
            alibi=alibi,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )

        attention_output = attn_outputs[0]

        outputs = attn_outputs[1:]

        layernorm_output = self.post_attention_layernorm(attention_output)

        # Get residual
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = attention_output

        # MLP.
        output = self.mlp(layernorm_output, residual)

        if use_cache:
            outputs = (output,) + outputs
        else:
            outputs = (output,) + outputs[1:]

        return outputs  # hidden_states, present, attentions


class BloomPreTrainedModel(PreTrainedModel):
    config_class = BloomConfig
    base_model_prefix = "transformer"
    _no_split_modules = ["BloomBlock"]

    @staticmethod
    def _convert_to_standard_cache(
        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
        """
        Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size,
        num_heads, ...]))
        """
        batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape
        num_heads = batch_size_times_num_heads // batch_size
        # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length]
        # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim]
        return tuple(
            (
                layer_past[0].view(batch_size, num_heads, head_dim, seq_length),
                layer_past[1].view(batch_size, num_heads, seq_length, head_dim),
            )
            for layer_past in past_key_value
        )

    @staticmethod
    def _convert_to_bloom_cache(
        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
        """
        Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        """
        batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
        batch_size_times_num_heads = batch_size * num_heads
        # key:  [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length]
        # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim]
        return tuple(
            (
                layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length),
                layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim),
            )
            for layer_past in past_key_value
        )


class BloomModel(BloomPreTrainedModel):
    def __init__(self, config: BloomConfig, weights):
        super().__init__(config)

        self.embed_dim = config.hidden_size
        self.num_heads = config.n_head

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()

        self.word_embeddings = TensorParallelEmbedding(
            prefix="word_embeddings", weights=weights
        )

        self.word_embeddings_layernorm = LayerNorm.load(
            prefix="word_embeddings_layernorm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        # Transformer blocks
        self.h = nn.ModuleList(
            [
                BloomBlock(layer_id=layer_id, config=config, weights=weights)
                for layer_id in range(config.num_hidden_layers)
            ]
        )

        # Final Layer Norm
        self.ln_f = LayerNorm.load(
            prefix="ln_f", weights=weights, eps=config.layer_norm_epsilon
        )

    def _prepare_attn_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: Tuple[int, int],
        past_key_values_length: int,
    ) -> torch.BoolTensor:
        # create causal mask
        # [batch_size, seq_length] -> [batch_size, tgt_length, src_length]
        combined_attention_mask = None
        device = attention_mask.device
        _, src_length = input_shape

        if src_length > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                device=device,
                past_key_values_length=past_key_values_length,
            )

        # [batch_size, seq_length] -> [batch_size, tgt_length, src_length]
        expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length)
        combined_attention_mask = (
            expanded_attn_mask
            if combined_attention_mask is None
            else expanded_attn_mask | combined_attention_mask
        )

        return combined_attention_mask

    def set_input_embeddings(self, new_embeddings: torch.Tensor):
        self.word_embeddings = new_embeddings

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **deprecated_arguments,
    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
        if deprecated_arguments.pop("position_ids", False) is not False:
            # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
            warnings.warn(
                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
                " passing `position_ids`.",
                FutureWarning,
            )
        if len(deprecated_arguments) > 0:
            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if past_key_values is None:
            past_key_values = tuple([None] * len(self.h))

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape batch_size x num_heads x N x N
        # head_mask has shape n_layer x batch x num_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        hidden_states = self.word_embeddings_layernorm(inputs_embeds)

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        # Compute alibi tensor: check build_alibi_tensor documentation
        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values[0] is not None:
            past_key_values_length = past_key_values[0][0].shape[-1]
            seq_length_with_past = seq_length_with_past + past_key_values_length
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past), device=hidden_states.device
            )
        else:
            attention_mask = attention_mask.to(hidden_states.device)

        alibi = build_alibi_tensor(attention_mask, self.num_heads)

        causal_mask = self._prepare_attn_mask(
            attention_mask,
            input_shape=(batch_size, seq_length),
            past_key_values_length=past_key_values_length,
        )

        if hasattr(self, "tp_rank"):
            assert self.num_heads % self.tp_world_size == 0
            block_size = self.num_heads // self.tp_world_size
            alibi = alibi[
                :, self.tp_rank * block_size : (self.tp_rank + 1) * block_size
            ]
            alibi = alibi.reshape(batch_size * block_size, 1, seq_length_with_past)
            causal_mask = torch.repeat_interleave(causal_mask, block_size, dim=0)
        else:
            alibi = alibi.reshape(batch_size * self.num_heads, 1, seq_length_with_past)
            causal_mask = torch.repeat_interleave(causal_mask, self.num_heads, dim=0)

        alibi = alibi.to(hidden_states.dtype)

        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=causal_mask,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
                alibi=alibi,
            )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (
                    outputs[2 if use_cache else 1],
                )

        # Add last hidden state
        hidden_states = self.ln_f(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    presents,
                    all_hidden_states,
                    all_self_attentions,
                ]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class BloomForCausalLM(BloomPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)
        self.transformer = BloomModel(config, weights)

        self.lm_head = SpeculativeHead.load(
            config,
            prefix="word_embeddings",
            weights=weights,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> dict:
        # only last token for input_ids if past is not None
        if past_key_values:
            input_ids = input_ids[:, -1].unsqueeze(-1)

            # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed
            if past_key_values[0][0].shape[0] == input_ids.shape[0]:
                past_key_values = self._convert_to_bloom_cache(past_key_values)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **deprecated_arguments,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        if deprecated_arguments.pop("position_ids", False) is not False:
            # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
            warnings.warn(
                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
                " passing `position_ids`.",
                FutureWarning,
            )
        if len(deprecated_arguments) > 0:
            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        logits, speculative_logits = self.lm_head(hidden_states)
        loss = None

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return (
            CausalLMOutputWithCrossAttentions(
                loss=loss,
                logits=logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            ),
            speculative_logits,
        )


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/clip.py
================================================
from typing import Optional, Tuple

import torch
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_attn_mask_utils import (
    _create_4d_causal_attention_mask,
    _prepare_4d_attention_mask,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPooling,
)
from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig

from text_generation_server.layers import (
    TensorParallelEmbedding,
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)


class CLIPVisionEmbeddings(nn.Module):
    def __init__(self, prefix, config: CLIPVisionConfig, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # TODO Should we TP this ?
        self.class_embedding = weights.get_tensor(f"{prefix}.class_embedding")

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.position_embedding", weights=weights
        )
        self.register_buffer(
            "position_ids",
            torch.arange(self.num_positions, device=weights.device).expand((1, -1)),
            persistent=False,
        )

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(
            pixel_values.to(dtype=target_dtype)
        )  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class CLIPTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(
            config.max_position_embeddings, embed_dim
        )

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids",
            torch.arange(config.max_position_embeddings).expand((1, -1)),
            persistent=False,
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = (
            input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class CLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_size = self.embed_dim // self.num_heads
        if self.head_size * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.embed_dim = self.embed_dim // weights.process_group.size()
        self.scale = self.head_size**-0.5
        self.dropout = config.attention_dropout

        self.qkv = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.out_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.out_proj",
            weights=weights,
            bias=True,
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.head_size)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj

        qkv = self.qkv(hidden_states)
        query_states, key_states, value_states = qkv.split(
            [
                self.head_size * self.num_heads,
            ]
            * 3,
            dim=2,
        )
        query_states = query_states * self.scale
        key_states = self._shape(key_states, -1, bsz)
        value_states = self._shape(value_states, -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_size)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = (
                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
                + causal_attention_mask
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = (
                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
                + attention_mask
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        attn_probs = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, None


class CLIPMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class CLIPEncoderLayer(nn.Module):
    def __init__(self, prefix, config: CLIPConfig, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.layer_norm1 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps
        )
        self.mlp = CLIPMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.layer_norm2 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class CLIPPreTrainedModel(nn.Module):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CLIPConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True


CLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
"""

CLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
"""

CLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
"""


class CLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, prefix, config: CLIPConfig, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                CLIPEncoderLayer(
                    prefix=f"{prefix}.layers.{i}", config=config, weights=weights
                )
                for i in range(config.num_hidden_layers)
            ]
        )

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
    ):
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        """

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
            )

        return hidden_states


class CLIPTextTransformer(nn.Module):
    def __init__(self, prefix: str, config: CLIPTextConfig, weights=None):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = CLIPTextEmbeddings(config)
        # Initialize weights and apply final processing with `self.post_init()`
        self.encoder = CLIPEncoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ):
        r"""
        Returns:

        """
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # CLIP's text model uses causal mask, prepare it here.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(
                attention_mask, hidden_states.dtype
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here.
            # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
            # ------------------------------------------------------------
            # text_embeds.shape = [batch_size, sequence_length, transformer.width]
            # take features from the eot embedding (eot_token is the highest number in each sequence)
            # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
            last_hidden_state[
                torch.arange(
                    last_hidden_state.shape[0], device=last_hidden_state.device
                ),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(
                    dim=-1
                ),
            ]
        else:
            # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible)
            last_hidden_state[
                torch.arange(
                    last_hidden_state.shape[0], device=last_hidden_state.device
                ),
                # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
                (
                    input_ids.to(dtype=torch.int, device=last_hidden_state.device)
                    == self.eos_token_id
                )
                .int()
                .argmax(dim=-1),
            ]

        return last_hidden_state


class CLIPTextModel(CLIPPreTrainedModel):
    config_class = CLIPTextConfig

    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, prefix, config: CLIPTextConfig):
        super().__init__(config)
        self.text_model = CLIPTextTransformer(prefix, config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ):
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""

        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )


class CLIPVisionTransformer(nn.Module):
    def __init__(self, prefix, config: CLIPVisionConfig, weights):
        super().__init__()
        self.config = config

        self.embeddings = CLIPVisionEmbeddings(
            prefix=f"{prefix}.embeddings", config=config, weights=weights
        )
        self.pre_layrnorm = nn.LayerNorm.load(
            prefix=f"{prefix}.pre_layrnorm", weights=weights, eps=config.layer_norm_eps
        )
        self.encoder = CLIPEncoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )
        # self.post_layernorm = nn.LayerNorm.load(prefix=f"{prefix}.post_layernorm", weights=weights, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
    ):
        r"""
        Returns:

        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
        )
        last_hidden_state = encoder_outputs
        # pooled_output = last_hidden_state[:, 0, :]
        # pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            # pooler_output=pooled_output,
            # hidden_states=encoder_outputs,
        )


class CLIPVisionModel(CLIPPreTrainedModel):
    config_class = CLIPVisionConfig
    main_input_name = "pixel_values"
    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
    ):
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""

        return self.vision_model(
            pixel_values=pixel_values,
        )


class CLIPModel(nn.Module):
    def __init__(self, prefix, config: CLIPConfig, weights):
        super().__init__()
        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = CLIPTextTransformer(text_config)
        self.vision_model = CLIPVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(
            self.vision_embed_dim, self.projection_dim, bias=False
        )
        self.text_projection = nn.Linear(
            self.text_embed_dim, self.projection_dim, bias=False
        )
        self.logit_scale = nn.Parameter(
            torch.tensor(self.config.logit_scale_init_value)
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
        )

        pooled_output = vision_outputs[1]  # pooled_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        return logits_per_image, logits_per_text


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
================================================
# coding=utf-8
# Copyright 2024 Cohere team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.utils.weights import UnquantizedWeight
from habana_frameworks.torch.hpex.kernels import (
    RotaryPosEmbeddingMode,
    apply_rotary_pos_emb,
)

import habana_frameworks.torch as htorch


class CohereRotary(PositionRotaryEmbedding):
    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        # Such controlflows may add some overhead.
        num_tokens = query.shape[0]
        head_size = query.shape[-1]
        rope_mode = RotaryPosEmbeddingMode.PAIRWISE
        sin = torch.repeat_interleave(sin, 2, dim=-1)
        cos = torch.repeat_interleave(cos, 2, dim=-1)
        rotary_dim = cos.shape[-1]
        query_shape = query.shape
        query = query.view(num_tokens, -1, head_size)
        query_rot = query[..., :rotary_dim]
        query_pass = query[..., rotary_dim:]
        query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, rope_mode)
        query.copy_(torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape))

        key_shape = key.shape
        key = key.view(num_tokens, -1, head_size)
        key_rot = key[..., :rotary_dim]
        key_pass = key[..., rotary_dim:]
        key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode)
        key.copy_(torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape))


class CohereLayerNorm(nn.Module):
    def __init__(self, prefix, weights, eps):
        super().__init__()
        weight = weights.get_sharded(f"{prefix}.weight", dim=0)
        self.weight = nn.Parameter(weight)
        # Fake weights
        self.ones = weight.new_ones(weight.shape[1])
        self.eps = eps

    def forward(self, hidden_states):
        hidden_states = hidden_states.reshape(
            -1, self.weight.shape[0], self.weight.shape[1]
        )
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        mean = hidden_states.mean(-1, keepdim=True)
        hidden_states_minus_mean = hidden_states - mean
        variance = hidden_states_minus_mean.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states_minus_mean * torch.rsqrt(variance + self.eps)
        hidden_states = self.weight.to(torch.float32) * hidden_states
        hidden_states = hidden_states.view(-1, self.weight.shape[1])
        return hidden_states.to(input_dtype)


def load_attention(config, prefix, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=config.attention_bias,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    if config.attention_bias:
        w = [
            weights.get_sharded(f"{p}.bias", dim=0)
            for p in [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
        ]
        bias = torch.cat(w, dim=0).to(dtype=weights.dtype).to(device=weights.device)
    else:
        bias = None

    return TensorParallelColumnLinear(get_linear(weight, bias=bias))


class FlashCohereAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = rotary_emb

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.use_qk_norm = config.use_qk_norm
        if self.use_qk_norm:
            self.q_norm = CohereLayerNorm(
                prefix=f"{prefix}.q_norm",
                weights=weights,
                eps=config.layer_norm_eps,
            )
            self.k_norm = CohereLayerNorm(
                prefix=f"{prefix}.k_norm",
                weights=weights,
                eps=config.layer_norm_eps,
            )
        else:
            self.q_norm = None
            self.k_norm = None

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=config.attention_bias,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states)
        query, key, value = qkv.split(
            [
                self.head_size * self.num_heads,
                self.head_size * self.num_key_value_heads,
                self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )

        if self.use_qk_norm:
            query = query.reshape(-1, self.head_size)
            key = key.reshape(-1, self.head_size)
            query = self.q_norm(query.contiguous())
            key = self.k_norm(key.contiguous())

        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_key_value_heads, self.head_size)
        value = value.view(-1, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, key, cos, sin)

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), reduce=False
        )


class CohereMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=False
        )


class FlashCohereLayer(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"
        self.self_attn = FlashCohereAttention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            rotary_emb=rotary_emb,
        )
        self.mlp = CohereMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.input_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )
        self.process_group = weights.process_group

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        mlp_output = self.mlp(normed_hidden_states)
        output = attn_output + mlp_output

        if self.process_group.size() > 1:
            torch.distributed.all_reduce(output, group=self.process_group)

        return output, res


class FlashCohereModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        rotary_emb = CohereRotary.static(
            config=config,
            dim=config.hidden_size // config.num_attention_heads,
            base=config.rope_theta,
            device=weights.device,
        )
        self.layers = nn.ModuleList(
            [
                FlashCohereLayer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                    rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.norm", weights=weights, eps=config.layer_norm_eps
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: torch.Tensor,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashCohereForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.model = FlashCohereModel(prefix, config, weights)
        try:
            self.lm_head = SpeculativeHead.load(
                config,
                prefix="lm_head",
                weights=weights,
            )
        except RuntimeError:
            self.lm_head = SpeculativeHead.load(
                config,
                prefix=f"{prefix}.embed_tokens",
                weights=weights,
            )
        self.logit_scale = config.logit_scale

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        logits *= self.logit_scale
        if speculative_logits is not None:
            speculative_logits *= self.logit_scale
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
================================================
# coding=utf-8
# Copyright 2022 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple, Any
from text_generation_server.layers.attention.kv_cache import get_kv_scales


from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    FastLinear,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)
from vllm_hpu_extension.ops import DynamicFusedMOE
import habana_frameworks.torch as htorch


class DbrxAttentionConfig(PretrainedConfig):
    def __init__(
        self,
        attn_pdrop: float = 0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")


class DbrxFFNConfig(PretrainedConfig):
    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1,
        uniform_expert_assignment: bool = False,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {"name": "silu"}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights
        self.uniform_expert_assignment = uniform_expert_assignment

        if uniform_expert_assignment:
            raise ValueError("`uniform_expert_assignment = True` is not supported")

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")


class DbrxConfig(PretrainedConfig):
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "n_heads",
        "num_hidden_layers": "n_layers",
    }

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        max_seq_len: int = 2048,
        vocab_size: int = 32000,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_config: Optional[DbrxAttentionConfig] = None,
        ffn_config: Optional[DbrxFFNConfig] = None,
        use_cache: bool = True,
        initializer_range: float = 0.02,
        output_router_logits: bool = False,
        router_aux_loss_coef: float = 0.05,
        **kwargs: Any,
    ):
        if attn_config is None:
            self.attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            self.attn_config = DbrxAttentionConfig(**attn_config)
        else:
            self.attn_config = attn_config

        if ffn_config is None:
            self.ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            self.ffn_config = DbrxFFNConfig(**ffn_config)
        else:
            self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError("tie_word_embeddings is not supported for Dbrx models.")

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def num_key_value_heads(self):
        # We can't use the attribute map, since this the number of KV
        # heads is not top-level.
        return self.attn_config.kv_n_heads


def promote_scalar(x: torch.Tensor) -> torch.Tensor:
    return x.view(1) if len(x.size()) == 0 else x


def load_attention(config, prefix, weights):
    return TensorParallelColumnLinear.load_qkv(
        config,
        prefix=f"{prefix}.Wqkv",
        weights=weights,
        bias=False,
        num_heads=config.n_heads,
        num_key_value_heads=config.attn_config.kv_n_heads,
    )


def _load_experts(config, prefix, weights):
    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    assert (
        config.ffn_config.ffn_hidden_size % world_size == 0
    ), f"The chosen size {config.ffn_config.ffn_hidden_size} is not compatible with sharding on {world_size} shards"

    expert_size = config.ffn_config.ffn_hidden_size
    block_size = expert_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size

    tensor = torch.empty(
        (config.ffn_config.moe_num_experts * block_size, config.d_model),
        dtype=weights.dtype,
        device=weights.device,
    )

    slice_ = weights._get_slice(f"{prefix}")

    for i in range(config.ffn_config.moe_num_experts):
        offset = i * expert_size
        expert_slice = slice_[start + offset : stop + offset]

        tensor[i * block_size : (i + 1) * block_size] = expert_slice.to(
            dtype=weights.dtype
        ).to(device=weights.device)
    return tensor


def _load_experts_quantized(config, prefix, weights, cls):
    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    assert (
        config.ffn_config.ffn_hidden_size % world_size == 0
    ), f"The chosen size {config.ffn_config.ffn_hidden_size} is not compatible with sharding on {world_size} shards"

    expert_size = config.ffn_config.ffn_hidden_size
    block_size = expert_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size

    slice_ = weights._get_slice(f"{prefix}")

    experts = []
    for i in range(config.ffn_config.moe_num_experts):
        if config.quantize in ["gptq", "awq"]:
            raise NotImplementedError(
                "Dbrx does not support gptq/awq quantization yet."
            )
        else:
            offset = i * expert_size
            expert_slice = (
                slice_[start + offset : stop + offset]
                .to(dtype=weights.dtype)
                .to(device=weights.device)
            )

        if cls == TensorParallelRowLinear:
            expert_slice = expert_slice.t().contiguous()
            linear = get_linear(expert_slice, None)
            experts.append(cls(linear, weights.process_group))
        else:
            linear = get_linear(expert_slice, None)
            experts.append(cls(linear))

    return experts


class DbrxAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.clip_qkv = config.attn_config.clip_qkv
        self.num_heads = config.n_heads
        self.hidden_size = config.d_model
        self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = rotary_emb

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.attn_config.kv_n_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.out_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states)
        if self.clip_qkv is not None:
            qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)

        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class DbrxNormAttentionNorm(nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.norm_1 = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.norm_1", weights=weights, eps=1e-5
        )
        self.self_attn = DbrxAttention(
            prefix=f"{prefix}.attn",
            config=config,
            weights=weights,
            rotary_emb=rotary_emb,
        )
        self.norm_2 = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.norm_2",
            weights=weights,
            eps=1e-5,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        normed_hidden_states, res = self.norm_1(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.norm_2(attn_output, res)

        return normed_attn_res_output, attn_res


@torch.jit.script
def select_experts(
    gate_logits: torch.Tensor, top_k: int, moe_normalize_expert_weights: int
):
    # all_probs: (sequence_length, n_experts) and upcast for softmax
    all_probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)
    # weights, selected_experts: (sequence_length, top-k)
    weights, selected_experts = torch.topk(all_probs, top_k, dim=-1)
    if moe_normalize_expert_weights:
        weights = weights / torch.norm(
            weights, p=moe_normalize_expert_weights, dim=-1, keepdim=True
        )
    weights = weights.view(-1)
    selected_experts = selected_experts.view(-1)

    return selected_experts, weights


@torch.jit.script
def round_up(x: torch.Tensor, value: int):
    return torch.div(x + (value - 1), value, rounding_mode="trunc") * value


class BlockSparseMoE(nn.Module):
    def __init__(self, prefix, config: DbrxConfig, weights):
        super().__init__()
        self.moe_normalize_expert_weights = (
            config.ffn_config.moe_normalize_expert_weights
        )
        self.hidden_dim = config.d_model
        self.ffn_dim = config.ffn_config.ffn_hidden_size // weights.process_group.size()
        self.num_experts = config.ffn_config.moe_num_experts
        self.top_k = config.ffn_config.moe_top_k

        act = config.ffn_config.ffn_act_fn["name"]
        if "gelu" in act:
            self.act = lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        elif "silu" in act:
            self.act = torch.nn.functional.silu
        else:
            self.act = ACT2FN[act]

        # gating
        self.gate = FastLinear.load(
            config, f"{prefix}.router.layer", weights, bias=False
        )

        # merged expert weights, all of size  (n_experts * ffn_dim, hidden_dim)
        w1 = _load_experts(config, f"{prefix}.experts.mlp.w1", weights).view(
            self.num_experts, self.ffn_dim, self.hidden_dim
        )
        v1 = _load_experts(config, f"{prefix}.experts.mlp.v1", weights).view(
            self.num_experts, self.ffn_dim, self.hidden_dim
        )
        self.wv1 = torch.cat([w1, v1], dim=1)
        self.w2 = (
            _load_experts(config, f"{prefix}.experts.mlp.w2", weights)
            .view(self.num_experts, self.ffn_dim, self.hidden_dim)
            .transpose(1, 2)
            .contiguous()
        )

        self.process_group = weights.process_group

        self.hpu_fused_moe = DynamicFusedMOE(self.num_experts)
        for i in range(self.num_experts):
            self.hpu_fused_moe.MoeOp.w13_list[i].set_weight(self.wv1[i])
            self.hpu_fused_moe.MoeOp.w2_list[i].set_weight(self.w2[i])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # router_logits: (num_tokens, n_experts)
        router_logits = self.gate(x)
        out = self.hpu_fused_moe(x, router_logits, self.top_k)

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class DenseMoE(nn.Module):
    def __init__(self, prefix, config: DbrxConfig, weights):
        super().__init__()

        self.moe_normalize_expert_weights = (
            config.ffn_config.moe_normalize_expert_weights
        )
        self.hidden_dim = config.d_model
        self.ffn_dim = config.ffn_config.ffn_hidden_size // weights.process_group.size()
        self.num_experts = config.ffn_config.moe_num_experts
        self.top_k = config.ffn_config.moe_top_k

        act = config.ffn_config.ffn_act_fn["name"]
        if "gelu" in act:
            self.act = lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        elif "silu" in act:
            self.act = torch.nn.functional.silu
        else:
            self.act = ACT2FN[act]

        # gating
        self.gate = FastLinear.load(
            config, f"{prefix}.router.layer", weights, bias=False
        )

        self.w1 = _load_experts_quantized(
            config,
            prefix=f"{prefix}.experts.mlp.w1",
            weights=weights,
            cls=TensorParallelColumnLinear,
        )
        self.w2 = _load_experts_quantized(
            config,
            prefix=f"{prefix}.experts.mlp.w2",
            weights=weights,
            cls=TensorParallelRowLinear,
        )
        self.v1 = _load_experts_quantized(
            config,
            prefix=f"{prefix}.experts.mlp.v1",
            weights=weights,
            cls=TensorParallelColumnLinear,
        )

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (sequence_length, model_dim)
        gate_logits: (sequence_length, n_experts)
        """
        # optional reshape
        input_shape = x.shape
        x = x.view(-1, input_shape[-1])

        # gate_logits: (sequence_length, n_experts)
        gate_logits = self.gate(x)
        # all_probs: (sequence_length, n_experts) and upcast for softmax
        weights = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)

        if self.top_k < self.num_experts:
            _, not_selected_experts = torch.topk(
                weights,
                self.num_experts - self.top_k,
                largest=False,
                sorted=False,
                dim=1,
            )
            # Mask not selected experts
            weights.scatter_(1, not_selected_experts, 0)

        # Re-normalize
        if self.moe_normalize_expert_weights:
            weights = weights / torch.norm(
                weights, p=self.moe_normalize_expert_weights, dim=-1, keepdim=True
            )
        weights = weights.to(x.dtype)

        # Final output tensor
        out = x.new_zeros(x.shape[0], self.hidden_dim)
        for i in range(self.num_experts):
            h = self.act(self.w1[i](x)) * self.v1[i](x)
            h = self.w2[i](h, reduce=False)
            # Add expert output to out with masking
            out += h * weights[:, i].view(-1, 1)

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out


class DbrxLayer(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
        super().__init__()
        prefix = f"{prefix}.blocks.{layer_id}"

        self.attn = DbrxNormAttentionNorm(
            prefix=f"{prefix}.norm_attn_norm",
            config=config,
            weights=weights,
            rotary_emb=rotary_emb,
        )

        moe_cls = BlockSparseMoE if config.quantize is None else DenseMoE
        self.moe = moe_cls(f"{prefix}.ffn", config, weights)

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        # Self Attention
        attn_output, attn_res = self.attn(
            hidden_states,
            residual,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        moe_output = self.moe(attn_output)

        return moe_output, attn_res


class DbrxModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.wte", weights=weights
        )
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.d_model // config.n_heads,
            base=config.attn_config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                DbrxLayer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                    rotary_emb,
                )
                for layer_id in range(config.n_layers)
            ]
        )
        self.norm = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.norm_f", weights=weights, eps=1e-5
        )

        self.head_size = self.layers[0].attn.self_attn.head_size
        self.num_heads = self.layers[0].attn.self_attn.num_heads
        self.num_key_value_heads = self.layers[0].attn.self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].attn.self_attn.rotary_emb.get_cos_sin(position_ids)
        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashDbrxForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"

        self.model = DbrxModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
================================================
# coding=utf-8
# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Type

import torch
import torch.distributed
from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig

from text_generation_server.layers import (
    FastLinear,
    SpeculativeHead,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
    Fp8Linear,
)
from text_generation_server.layers.attention import (
    Seqlen,
    attention,
    paged_attention_mla,
    set_block_mapping,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers.attention.kv_cache import KVCache, get_kv_scales
from text_generation_server.layers.layernorm import FastRMSNorm
from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale
from text_generation_server.utils.weights import Weights
import habana_frameworks.torch as htorch


def get_and_maybe_dequant_weights(layer: torch.nn.Module) -> torch.Tensor:
    if isinstance(layer, Fp8Linear):
        eye = torch.eye(
            layer.qweight.shape[-1], dtype=torch.bfloat16, device=layer.qweight.device
        )
        dequant_weights = layer(eye)
        del eye
        # standardize to (output, input)
        return dequant_weights.T
    return layer.weight


class DeepseekV2Config(PretrainedConfig):
    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=2,
        n_routed_experts=160,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="gready",
        n_group=8,
        topk_group=3,
        num_experts_per_tok=6,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func="softmax",
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError(
                "tie_word_embeddings is not supported for Deepseek V2 models."
            )

        if ep_size != 1:
            raise ValueError(
                f"Currently only ep_size == 1 is supported for Deepseek V2 models, was {ep_size}"
            )

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class DeepseekV2Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights: Weights,
        rotary_emb,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.kv_lora_rank = config.kv_lora_rank
        self.q_lora_rank = config.q_lora_rank
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.head_size = config.qk_nope_head_dim + config.qk_rope_head_dim
        self.value_head_size = config.v_head_dim
        self.head_pad_size = max(self.head_size, self.value_head_size)
        self.rotary_emb = rotary_emb

        mscale = get_mscale(
            self.rotary_emb.scaling_factor, self.rotary_emb.mscale_all_dim
        )
        self.softmax_scale = self.head_size**-0.5 * mscale * mscale

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        if self.q_lora_rank is None:
            self.q_proj = TensorParallelColumnLinear.load(
                config,
                prefix=f"{prefix}.q_proj",
                weights=weights,
                bias=config.attention_bias,
            )
        else:
            self.q_a_proj = get_linear(
                weight=weights.get_weights(f"{prefix}.q_a_proj"),
                bias=(
                    weights.get_tensor(f"{prefix}.q_a_proj.bias")
                    if config.attention_bias
                    else None
                ),
            )
            self.q_a_layernorm = FastRMSNorm.load(
                prefix=f"{prefix}.q_a_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
            self.q_b_proj = TensorParallelColumnLinear.load(
                config,
                prefix=f"{prefix}.q_b_proj",
                weights=weights,
                bias=config.attention_bias,
            )

        self.kv_a_proj_with_mqa = get_linear(
            weight=weights.get_weights(f"{prefix}.kv_a_proj_with_mqa"),
            bias=(
                weights.get_tensor(f"{prefix}.kv_a_proj_with_mqa.bias")
                if config.attention_bias
                else None
            ),
        )

        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.kv_a_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps
        )

        self.kv_b_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.kv_b_proj",
            weights=weights,
            bias=config.attention_bias,
        )

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

        kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj.linear).T
        kv_b_proj_weight = kv_b_proj_weight.view(
            self.kv_lora_rank,
            self.num_heads,
            self.qk_nope_head_dim + self.value_head_size,
        )

        W_UK, W_UV = kv_b_proj_weight.split(
            [self.qk_nope_head_dim, self.value_head_size], dim=-1
        )
        # Convert from (L, N, V) to (N, L, V)
        self.W_UV = W_UV.transpose(0, 1)
        # Convert from (L, N, P) to (N, P, L)
        self.W_UK_T = W_UK.permute(1, 2, 0)

    def _q_proj_and_k_up_proj(self, x):
        q_proj = self.q_proj if self.q_lora_rank is None else self.q_b_proj
        q_nope, q_pe = (
            q_proj(x)
            .view(-1, self.num_heads, self.head_size)
            .split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
        )

        # Convert from (B, N, P) to (N, B, P)
        q_nope = q_nope.transpose(0, 1)
        # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
        ql_nope = torch.bmm(q_nope, self.W_UK_T)
        # Convert from (N, B, L) to (B, N, L)
        return ql_nope.transpose(0, 1), q_pe

    def _v_up_proj_and_o_proj(self, x):
        # Convert from (B, N, L) to (N, B, L)
        x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
        # Multiply (N, B, L) x (N, L, V) -> (N, B, V)
        x = torch.bmm(x, self.W_UV)
        # Convert from (N, B, V) to (B, N * V)
        x = x.transpose(0, 1).reshape(-1, self.num_heads * self.value_head_size)
        return self.o_proj(x)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        cu_seqlen_prefill: torch.Tensor,
        kv_cache: KVCache,
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ):
        if self.q_lora_rank is None:
            hidden_states_or_q_c = hidden_states
        else:
            hidden_states_or_q_c = self.q_a_layernorm(self.q_a_proj(hidden_states))[0]

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, key_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )

        key_pe = key_pe.view(-1, 1, self.qk_rope_head_dim)
        kv_c_normed = self.kv_a_layernorm(compressed_kv.contiguous())[0]

        # Prefill
        if cu_seqlen_prefill is not None:
            q_proj = self.q_proj if self.q_lora_rank is None else self.q_b_proj
            query = q_proj(hidden_states_or_q_c)
            query = query.view(-1, self.num_heads, self.head_size)
            query_nope, query_pe = torch.split(
                query, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
            )
        else:
            query_nope, query_pe = self._q_proj_and_k_up_proj(hidden_states_or_q_c)

        batch_size, heads, head_dim = query_pe.shape
        query_pe = (
            query_pe.view(batch_size, heads, head_dim // 2, 2)
            .transpose(2, 3)
            .reshape(batch_size, heads, head_dim)
        )
        batch_size, heads, head_dim = key_pe.shape
        key_pe = (
            key_pe.view(batch_size, heads, head_dim // 2, 2)
            .transpose(2, 3)
            .reshape(batch_size, heads, head_dim)
        )
        self.rotary_emb(query_pe, key_pe, cos, sin)
        latent_vec_k = torch.concat(
            (kv_c_normed, key_pe.view(-1, self.qk_rope_head_dim)), dim=-1
        )
        latent_vec_k = latent_vec_k.view(-1, self.qk_rope_head_dim + self.kv_lora_rank)

        latent_vec_k = latent_vec_k.unflatten(0, (slots.size(0), -1))

        kv_cache.store(
            key=latent_vec_k,
            value=None,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        if cu_seqlen_prefill is not None:
            kv = self.kv_b_proj(kv_c_normed).view(
                -1,
                self.num_key_value_heads,
                self.qk_nope_head_dim + self.value_head_size,
            )

            key_nope, value = torch.split(
                kv, [self.qk_nope_head_dim, self.value_head_size], dim=-1
            )
            query[..., self.qk_nope_head_dim :] = query_pe
            key = torch.empty_like(query)
            key[..., : self.qk_nope_head_dim] = key_nope
            key[..., self.qk_nope_head_dim :] = key_pe

            # We need to pad the heads because Flash Attention does not support
            # qk and v with different head sizes.
            query = torch.nn.functional.pad(
                query, (0, self.head_pad_size - self.head_size), value=0
            )
            key = torch.nn.functional.pad(
                key, (0, self.head_pad_size - self.head_size), value=0
            )
            value = torch.nn.functional.pad(
                value, (0, self.head_pad_size - self.value_head_size), value=0
            )

            # flash attention
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
            attn_output = attn_output[..., : self.value_head_size]

            return self.o_proj(
                attn_output.reshape(-1, self.num_heads * self.value_head_size)
            )
        else:
            # Decode
            query = torch.cat([query_nope, query_pe], dim=-1)
            attn_output = paged_attention_mla(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
                kv_lora_rank=self.kv_lora_rank,
            )
            attn_output = self._v_up_proj_and_o_proj(attn_output)
            return attn_output


class DeepseekV2MLP(nn.Module):
    def __init__(self, prefix: str, config, weights, intermediate_size: int):
        super().__init__()
        self.hidden_act = config.hidden_act
        if self.hidden_act != "silu":
            # Bail out because MoE only supports silu.
            raise NotImplementedError(
                "Currently only `silu` is supported as an activation for Deepseek V2."
            )
        self.act = ACT2FN[self.hidden_act]

        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )

        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )

        self.intermediate_size = intermediate_size // weights.process_group.size()

        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize

    def forward(self, hidden_states: torch.Tensor, reduce: bool = True):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=reduce
        )


class DeepseekV2MoE(nn.Module):
    def __init__(
        self,
        prefix,
        config: DeepseekV2Config,
        moe_layer_cls: Type[MoELayer],
        weights,
    ):
        super().__init__()

        self.hidden_dim = config.hidden_size
        self.moe_intermediate_size = (
            config.moe_intermediate_size // weights.process_group.size()
        )
        self.routed_scaling_factor = config.routed_scaling_factor

        # Gating
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)

        self.moe_layer = moe_layer_cls(
            prefix=f"{prefix}.experts",
            n_experts=config.n_routed_experts,
            n_expert_group=config.n_group,
            renormalize=config.norm_topk_prob,
            topk=config.num_experts_per_tok,
            topk_group=config.topk_group,
            weights=weights,
        )
        assert isinstance(self.moe_layer, MoELayer)

        if config.n_shared_experts is not None:
            self.shared_experts = DeepseekV2MLP(
                prefix=f"{prefix}.shared_experts",
                config=config,
                weights=weights,
                intermediate_size=config.moe_intermediate_size
                * config.n_shared_experts,
            )
        else:
            self.shared_experts = None

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.shared_experts is not None:
            shared_output = self.shared_experts(x, reduce=False)
        else:
            shared_output = None

        router_logits = self.gate(x)

        out = self.moe_layer(x, gating_output=router_logits)

        if shared_output is not None:
            out = out + shared_output

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class DeepseekV2Layer(nn.Module):
    def __init__(self, prefix, layer_id, config, weights, rotary_emb):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"

        self.self_attn = DeepseekV2Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            rotary_emb=rotary_emb,
        )

        if (
            config.n_routed_experts is not None
            and layer_id >= config.first_k_dense_replace
            and layer_id % config.moe_layer_freq == 0
        ):
            moe_layer_cls = (
                SparseMoELayer
                if SparseMoELayer.is_supported(weights)
                else DenseMoELayer
            )
            self.mlp = DeepseekV2MoE(f"{prefix}.mlp", config, moe_layer_cls, weights)
        else:
            self.mlp = DeepseekV2MLP(
                prefix=f"{prefix}.mlp",
                config=config,
                weights=weights,
                intermediate_size=config.intermediate_size,
            )

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        residual: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        cu_seqlen_prefill: torch.Tensor,
        kv_cache,
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ):
        normed_hidden_states, residual = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        # faster post attention rms norm
        normed_attn_res_output, residual = self.post_attention_layernorm(
            attn_output, residual
        )

        output = self.mlp(normed_attn_res_output)

        return output, residual


class DeepseekV2Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )

        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.qk_rope_head_dim,
            base=config.rope_theta,
            device=weights.device,
        )
        self.layers = nn.ModuleList(
            [
                DeepseekV2Layer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                    rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashDeepseekV2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()

        self.model = DeepseekV2Model(
            "model" if not prefix else f"{prefix}.model", config, weights
        )
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head" if not prefix else f"{prefix}.lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py
================================================
# coding=utf-8
# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Type

import torch
import torch.distributed
from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig

from text_generation_server.layers import (
    FastLinear,
    SpeculativeHead,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
    Fp8Linear,
)
from text_generation_server.layers.attention import (
    Seqlen,
    attention,
    paged_attention_mla,
    set_block_mapping,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers.attention.kv_cache import KVCache, get_kv_scales
from text_generation_server.layers.layernorm import FastRMSNorm
from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale
from text_generation_server.utils.weights import Weights
import habana_frameworks.torch as htorch


def get_and_maybe_dequant_weights(layer: torch.nn.Module) -> torch.Tensor:
    if isinstance(layer, Fp8Linear):
        eye = torch.eye(
            layer.qweight.shape[-1], dtype=torch.bfloat16, device=layer.qweight.device
        )
        dequant_weights = layer(eye)
        del eye
        # standardize to (output, input)
        return dequant_weights.T
    return layer.weight


class DeepseekV3Config(PretrainedConfig):
    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=2,
        n_routed_experts=160,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="gready",
        n_group=8,
        topk_group=3,
        num_experts_per_tok=6,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func="softmax",
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError(
                "tie_word_embeddings is not supported for Deepseek V2 models."
            )

        if ep_size != 1:
            raise ValueError(
                f"Currently only ep_size == 1 is supported for Deepseek V2 models, was {ep_size}"
            )

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class DeepseekV3Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights: Weights,
        rotary_emb,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.kv_lora_rank = config.kv_lora_rank
        self.q_lora_rank = config.q_lora_rank
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.head_size = config.qk_nope_head_dim + config.qk_rope_head_dim
        self.value_head_size = config.v_head_dim
        self.head_pad_size = max(self.head_size, self.value_head_size)
        self.rotary_emb = rotary_emb

        mscale = get_mscale(
            self.rotary_emb.scaling_factor, self.rotary_emb.mscale_all_dim
        )
        self.softmax_scale = self.head_size**-0.5 * mscale * mscale

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        if self.q_lora_rank is None:
            self.q_proj = TensorParallelColumnLinear.load(
                config,
                prefix=f"{prefix}.q_proj",
                weights=weights,
                bias=config.attention_bias,
            )
        else:
            self.q_a_proj = get_linear(
                weight=weights.get_weights(f"{prefix}.q_a_proj"),
                bias=(
                    weights.get_tensor(f"{prefix}.q_a_proj.bias")
                    if config.attention_bias
                    else None
                ),
            )
            self.q_a_layernorm = FastRMSNorm.load(
                prefix=f"{prefix}.q_a_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
            self.q_b_proj = TensorParallelColumnLinear.load(
                config,
                prefix=f"{prefix}.q_b_proj",
                weights=weights,
                bias=config.attention_bias,
            )

        self.kv_a_proj_with_mqa = get_linear(
            weight=weights.get_weights(f"{prefix}.kv_a_proj_with_mqa"),
            bias=(
                weights.get_tensor(f"{prefix}.kv_a_proj_with_mqa.bias")
                if config.attention_bias
                else None
            ),
        )

        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.kv_a_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps
        )

        self.kv_b_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.kv_b_proj",
            weights=weights,
            bias=config.attention_bias,
        )

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

        kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj.linear).T
        kv_b_proj_weight = kv_b_proj_weight.view(
            self.kv_lora_rank,
            self.num_heads,
            self.qk_nope_head_dim + self.value_head_size,
        )
        W_UK, W_UV = kv_b_proj_weight.split(
            [self.qk_nope_head_dim, self.value_head_size], dim=-1
        )
        # Convert from (L, N, V) to (N, L, V)
        self.W_UV = W_UV.transpose(0, 1)
        # Convert from (L, N, P) to (N, P, L)
        self.W_UK_T = W_UK.permute(1, 2, 0)

    def _q_proj_and_k_up_proj(self, x):
        q_proj = self.q_proj if self.q_lora_rank is None else self.q_b_proj
        q_nope, q_pe = (
            q_proj(x)
            .view(-1, self.num_heads, self.head_size)
            .split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
        )

        # Convert from (B, N, P) to (N, B, P)
        q_nope = q_nope.transpose(0, 1)
        # Multiply (N, B, P) x (N, P, L) -> (N, B, L)
        ql_nope = torch.bmm(q_nope, self.W_UK_T)
        # Convert from (N, B, L) to (B, N, L)
        return ql_nope.transpose(0, 1), q_pe

    def _v_up_proj_and_o_proj(self, x):
        # Convert from (B, N, L) to (N, B, L)
        x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1)
        # Multiply (N, B, L) x (N, L, V) -> (N, B, V)
        x = torch.bmm(x, self.W_UV)
        # Convert from (N, B, V) to (B, N * V)
        x = x.transpose(0, 1).reshape(-1, self.num_heads * self.value_head_size)
        return self.o_proj(x)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        cu_seqlen_prefill: torch.Tensor,
        kv_cache: KVCache,
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ):
        if self.q_lora_rank is None:
            hidden_states_or_q_c = hidden_states
        else:
            hidden_states_or_q_c = self.q_a_layernorm(self.q_a_proj(hidden_states))[0]

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, key_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )

        key_pe = key_pe.view(-1, 1, self.qk_rope_head_dim)
        kv_c_normed = self.kv_a_layernorm(compressed_kv.contiguous())[0]

        # Prefill
        if cu_seqlen_prefill is not None:
            q_proj = self.q_proj if self.q_lora_rank is None else self.q_b_proj
            query = q_proj(hidden_states_or_q_c)
            query = query.view(-1, self.num_heads, self.head_size)
            query_nope, query_pe = torch.split(
                query, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
            )
        else:
            query_nope, query_pe = self._q_proj_and_k_up_proj(hidden_states_or_q_c)

        batch_size, heads, head_dim = query_pe.shape
        query_pe = (
            query_pe.view(batch_size, heads, head_dim // 2, 2)
            .transpose(2, 3)
            .reshape(batch_size, heads, head_dim)
        )
        batch_size, heads, head_dim = key_pe.shape
        key_pe = (
            key_pe.view(batch_size, heads, head_dim // 2, 2)
            .transpose(2, 3)
            .reshape(batch_size, heads, head_dim)
        )
        self.rotary_emb(query_pe, key_pe, cos, sin)
        latent_vec_k = torch.concat(
            (kv_c_normed, key_pe.view(-1, self.qk_rope_head_dim)), dim=-1
        )
        latent_vec_k = latent_vec_k.view(-1, self.qk_rope_head_dim + self.kv_lora_rank)

        latent_vec_k = latent_vec_k.unflatten(0, (slots.size(0), -1))

        kv_cache.store(
            key=latent_vec_k,
            value=None,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        if cu_seqlen_prefill is not None:
            kv = self.kv_b_proj(kv_c_normed).view(
                -1,
                self.num_key_value_heads,
                self.qk_nope_head_dim + self.value_head_size,
            )

            key_nope, value = torch.split(
                kv, [self.qk_nope_head_dim, self.value_head_size], dim=-1
            )
            query[..., self.qk_nope_head_dim :] = query_pe
            key = torch.empty_like(query)
            key[..., : self.qk_nope_head_dim] = key_nope
            key[..., self.qk_nope_head_dim :] = key_pe

            # We need to pad the heads because Flash Attention does not support
            # qk and v with different head sizes.
            query = torch.nn.functional.pad(
                query, (0, self.head_pad_size - self.head_size), value=0
            )
            key = torch.nn.functional.pad(
                key, (0, self.head_pad_size - self.head_size), value=0
            )
            value = torch.nn.functional.pad(
                value, (0, self.head_pad_size - self.value_head_size), value=0
            )

            # flash attention
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
            attn_output = attn_output[..., : self.value_head_size]

            return self.o_proj(
                attn_output.reshape(-1, self.num_heads * self.value_head_size)
            )
        else:
            # Decode
            query = torch.cat([query_nope, query_pe], dim=-1)
            attn_output = paged_attention_mla(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
                kv_lora_rank=self.kv_lora_rank,
            )
            attn_output = self._v_up_proj_and_o_proj(attn_output)
            return attn_output


class DeepseekV3MLP(nn.Module):
    def __init__(self, prefix: str, config, weights, intermediate_size: int):
        super().__init__()
        self.hidden_act = config.hidden_act
        if self.hidden_act != "silu":
            # Bail out because MoE only supports silu.
            raise NotImplementedError(
                "Currently only `silu` is supported as an activation for Deepseek V2."
            )
        self.act = ACT2FN[self.hidden_act]

        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )

        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )

        self.intermediate_size = intermediate_size // weights.process_group.size()

        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize

    def forward(self, hidden_states: torch.Tensor, reduce: bool = True):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=reduce
        )


class DeepseekV3MoE(nn.Module):
    def __init__(
        self,
        prefix,
        config: DeepseekV3Config,
        moe_layer_cls: Type[MoELayer],
        weights,
    ):
        super().__init__()

        self.hidden_dim = config.hidden_size
        self.moe_intermediate_size = (
            config.moe_intermediate_size // weights.process_group.size()
        )
        self.routed_scaling_factor = config.routed_scaling_factor

        # Gating
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)

        if config.topk_method == "noaux_tc":
            self.gate.e_score_correction_bias = torch.zeros(
                config.n_routed_experts, device=weights.device
            )
        else:
            self.gate.e_score_correction_bias = None

        self.moe_layer = moe_layer_cls(
            prefix=f"{prefix}.experts",
            n_experts=config.n_routed_experts,
            n_expert_group=config.n_group,
            renormalize=config.norm_topk_prob,
            topk=config.num_experts_per_tok,
            topk_group=config.topk_group,
            weights=weights,
            scoring_func=config.scoring_func,
            e_score_correction_bias=self.gate.e_score_correction_bias,
        )
        assert isinstance(self.moe_layer, MoELayer)

        if config.n_shared_experts is not None:
            self.shared_experts = DeepseekV3MLP(
                prefix=f"{prefix}.shared_experts",
                config=config,
                weights=weights,
                intermediate_size=config.moe_intermediate_size
                * config.n_shared_experts,
            )
        else:
            self.shared_experts = None

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.shared_experts is not None:
            shared_output = self.shared_experts(x, reduce=False)
        else:
            shared_output = None

        router_logits = self.gate(x)

        out = self.moe_layer(x, gating_output=router_logits)

        if shared_output is not None:
            out = out + shared_output

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class DeepseekV3Layer(nn.Module):
    def __init__(self, prefix, layer_id, config, weights, rotary_emb):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"

        self.self_attn = DeepseekV3Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            rotary_emb=rotary_emb,
        )

        if (
            config.n_routed_experts is not None
            and layer_id >= config.first_k_dense_replace
            and layer_id % config.moe_layer_freq == 0
        ):
            moe_layer_cls = (
                SparseMoELayer
                if SparseMoELayer.is_supported(weights)
                else DenseMoELayer
            )
            self.mlp = DeepseekV3MoE(f"{prefix}.mlp", config, moe_layer_cls, weights)
        else:
            self.mlp = DeepseekV3MLP(
                prefix=f"{prefix}.mlp",
                config=config,
                weights=weights,
                intermediate_size=config.intermediate_size,
            )

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        residual: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        cu_seqlen_prefill: torch.Tensor,
        kv_cache,
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ):
        normed_hidden_states, residual = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        # faster post attention rms norm
        normed_attn_res_output, residual = self.post_attention_layernorm(
            attn_output, residual
        )

        output = self.mlp(normed_attn_res_output)

        return output, residual


class DeepseekV3Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.qk_rope_head_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                DeepseekV3Layer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                    rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashDeepseekV3ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()

        self.model = DeepseekV3Model(
            "model" if not prefix else f"{prefix}.model", config, weights
        )
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head" if not prefix else f"{prefix}.lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
from text_generation_server.utils.weights import UnquantizedWeight
import habana_frameworks.torch as htorch


class Gemma2Config(PretrainedConfig):
    def __init__(
        self,
        vocab_size=256128,
        hidden_size=3072,
        intermediate_size=24576,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        head_dim=256,
        hidden_act="gelu_pytorch_tanh",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class Gemma2FastRMSNorm(FastRMSNorm):
    @classmethod
    def load(cls, prefix: str, weights, eps=1e-6):
        dtype = weights.dtype
        weights.dtype = torch.float32
        weight = weights.get_tensor(f"{prefix}.weight") + 1
        weights.dtype = dtype
        new = cls(weight, eps)
        new.dtype = dtype
        return new

    # perform the multiplication in full precision and downcast after
    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states * self.weight
        return hidden_states.to(self.dtype), residual


def load_attention(config, prefix: str, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.head_dim
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias=None))


class FlashGemma2Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        layer_id,
        causal: bool,
        is_sliding: bool,
        rotary_emb,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_size = config.head_dim
        self.causal = causal
        if is_sliding:
            self.window_size = config.sliding_window
        else:
            self.window_size = -1
        self.rotary_emb = rotary_emb

        # self.softmax_scale = self.head_size**-0.5
        self.softmax_scale = config.query_pre_attn_scalar**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )
        self.softcap = config.attn_logit_softcapping

        query_key_value = load_attention(config, prefix, weights)
        self.query_key_value = TensorParallelMultiAdapterLinear.load(
            query_key_value,
            layer_id,
            ["q_proj", "k_proj", "v_proj"],
            sizes=[
                self.head_size * config.num_attention_heads,
                self.head_size * config.num_key_value_heads,
                self.head_size * config.num_key_value_heads,
            ],
            process_group=weights.process_group,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            layer_id,
            "o_proj",
            process_group=weights.process_group,
        )

        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
                window_size_left=self.window_size,
                softcap=self.softcap,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                softcap=self.softcap,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
                window_size_left=self.window_size,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Gemma2MLP(nn.Module):
    def __init__(self, prefix, config, weights, layer_id):
        super().__init__()
        act = config.hidden_activation
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            layer_id,
            ["gate_proj", "up_proj"],
            sizes=[
                config.intermediate_size,
                config.intermediate_size,
            ],
            process_group=weights.process_group,
        )

        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            layer_id,
            "down_proj",
            process_group=weights.process_group,
        )

        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


class FlashGemma2Layer(nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        layer_id,
        causal: bool,
        is_sliding: bool,
        rotary_emb,
    ):
        super().__init__()
        self.self_attn = FlashGemma2Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            layer_id=layer_id,
            causal=causal,
            is_sliding=is_sliding,
            rotary_emb=rotary_emb,
        )
        self.mlp = Gemma2MLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id
        )

        self.input_layernorm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.pre_feedforward_layernorm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.pre_feedforward_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.post_feedforward_layernorm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.post_feedforward_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        hpu_attention_meta,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            hpu_attention_meta,
        )

        # faster post attention rms norm
        normed_attn_res_output, _ = self.post_attention_layernorm(attn_output)
        normed_attn_res_output = normed_attn_res_output + res
        res = normed_attn_res_output

        pre_normed, _ = self.pre_feedforward_layernorm(normed_attn_res_output)
        mlp_output = self.mlp(pre_normed, adapter_data)
        post_hidden_states, _ = self.post_feedforward_layernorm(mlp_output)

        return post_hidden_states, normed_attn_res_output


class FlashGemma2Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.head_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                FlashGemma2Layer(
                    prefix=f"{prefix}.layers.{layer_id}",
                    config=config,
                    weights=weights,
                    layer_id=layer_id,
                    causal=causal,
                    is_sliding=layer_id % 2 == 0,
                    rotary_emb=rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        adapter_data: Optional[torch.Tensor],
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )
        hidden_states = inputs_embeds

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()

        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                adapter_data,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashGemma2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, *, causal: bool = True):
        super().__init__()

        embed_norm = config.hidden_size**0.5
        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.embed_tokens.weight *= embed_norm

        self.model = FlashGemma2Model(
            prefix=prefix, config=config, weights=weights, causal=causal
        )
        self.lm_head = SpeculativeHead.load(
            prefix=(
                f"{prefix}.embed_tokens"
                if config.tie_word_embeddings
                else f"{prefix}.lm_head"
            ),
            config=config,
            weights=weights,
        )
        self.softcap = config.final_logit_softcapping
        assert isinstance(self.softcap, float)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_embeds = self.embed_tokens(input_ids)
        hidden_states = self.model(
            input_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)

        logits /= self.softcap
        logits = torch.tanh(logits)
        logits *= self.softcap

        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
================================================
# coding=utf-8
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed
from torch import nn
from typing import Optional, List, Tuple
import copy

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
    #
    SpeculativeHead,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)

import torch


from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
    load_vision_model,
)


from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
from text_generation_server.utils.weights import UnquantizedWeight
from transformers.activations import ACT2FN
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
    set_block_mapping,
    HPUPagedAttentionMetadata,
)
import habana_frameworks.torch as htorch

ATTENTION_TYPE_GLOBAL = "global"
ATTENTION_TYPE_LOCAL = "local_sliding"


class Gemma3FastRMSNorm(FastRMSNorm):
    @classmethod
    def load(cls, prefix: str, weights, eps=1e-6):
        dtype = weights.dtype
        weights.dtype = torch.float32
        weight = weights.get_tensor(f"{prefix}.weight") + 1
        weights.dtype = dtype
        new = cls(weight, eps)
        new.dtype = dtype
        return new

    # perform the multiplication in full precision and downcast after
    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states * self.weight
        return hidden_states.to(self.dtype), residual


def load_attention(config, prefix: str, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.head_dim
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias=None))


class FlashGemma3Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        layer_id,
        causal: bool,
        is_sliding: bool,
        local_rotary_emb,
        global_rotary_emb,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_size = config.head_dim
        self.causal = causal
        if is_sliding:
            self.window_size = config.sliding_window
            self.rotary_emb = local_rotary_emb
        else:
            self.window_size = -1
            self.rotary_emb = global_rotary_emb

        self.softmax_scale = (
            config.query_pre_attn_scalar**-0.5
            if config.query_pre_attn_scalar is not None
            else None
        )
        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )
        self.softcap = None  # config.attn_logit_softcapping

        query_key_value = load_attention(config, prefix, weights)
        self.query_key_value = TensorParallelMultiAdapterLinear.load(
            query_key_value,
            layer_id,
            ["q_proj", "k_proj", "v_proj"],
            sizes=[
                self.head_size * config.num_attention_heads,
                self.head_size * config.num_key_value_heads,
                self.head_size * config.num_key_value_heads,
            ],
            process_group=weights.process_group,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            layer_id,
            "o_proj",
            process_group=weights.process_group,
        )

        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)
        self.q_norm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps
        )
        self.k_norm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps
        )
        self.enable_gqa = self.num_heads != self.num_key_value_heads

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        hpu_attention_meta,
    ):

        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )

        kv = kv.view(-1, 2, self.num_key_value_heads * self.head_size)
        key = kv[:, 0]
        value = kv[:, 1]

        query = query.reshape(-1, self.head_size)
        key = key.reshape(-1, self.head_size)

        query, _ = self.q_norm(query.contiguous())
        key, _ = self.k_norm(key.contiguous())

        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_key_value_heads, self.head_size)
        value = value.view(-1, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, key, cos, sin)

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )
        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
                window_size_left=self.window_size,
                softcap=self.softcap,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                softcap=self.softcap,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
                window_size_left=self.window_size,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Gemma3MLP(nn.Module):
    def __init__(self, prefix, config, weights, layer_id):
        super().__init__()
        act = config.hidden_activation
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            layer_id,
            ["gate_proj", "up_proj"],
            sizes=[
                config.intermediate_size,
                config.intermediate_size,
            ],
            process_group=weights.process_group,
        )

        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            layer_id,
            "down_proj",
            process_group=weights.process_group,
        )

        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


class FlashGemma3Layer(nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        layer_id,
        causal: bool,
        is_sliding: bool,
        local_rotary_emb,
        global_rotary_emb,
    ):
        super().__init__()
        self.self_attn = FlashGemma3Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            layer_id=layer_id,
            causal=causal,
            is_sliding=is_sliding,
            local_rotary_emb=local_rotary_emb,
            global_rotary_emb=global_rotary_emb,
        )
        self.mlp = Gemma3MLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id
        )

        self.input_layernorm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.pre_feedforward_layernorm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.pre_feedforward_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.post_feedforward_layernorm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.post_feedforward_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        hpu_attention_meta,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            hpu_attention_meta,
        )

        # faster post attention rms norm
        normed_attn_res_output, _ = self.post_attention_layernorm(attn_output)
        normed_attn_res_output = normed_attn_res_output + res
        res = normed_attn_res_output

        pre_normed, _ = self.pre_feedforward_layernorm(normed_attn_res_output)
        mlp_output = self.mlp(pre_normed, adapter_data)
        post_hidden_states, _ = self.post_feedforward_layernorm(mlp_output)

        return post_hidden_states, normed_attn_res_output


class FlashGemma3Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        local_config = copy.deepcopy(config)
        local_config.rope_scaling = dict(rope_type="default")
        local_rotary_emb = PositionRotaryEmbedding.static(
            config=local_config,
            dim=config.head_dim,
            base=config.rope_local_base_freq,
            device=weights.device,
        )
        global_rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.head_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                FlashGemma3Layer(
                    prefix=f"{prefix}.layers.{layer_id}",
                    config=config,
                    weights=weights,
                    layer_id=layer_id,
                    causal=causal,
                    is_sliding=bool((layer_id + 1) % config.sliding_window_pattern),
                    local_rotary_emb=local_rotary_emb,
                    global_rotary_emb=global_rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        adapter_data: Optional[torch.Tensor],
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )
        hidden_states = inputs_embeds

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer

        residual = None
        for i, layer in enumerate(self.layers):
            # Get rotary cos and sin for this forward
            # Avoid to index in each layer
            cos, sin = layer.self_attn.rotary_emb.get_cos_sin(position_ids)
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                adapter_data,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashGemma3ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, *, causal: bool = True):
        super().__init__()

        embed_norm = config.hidden_size**0.5
        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.embed_tokens.weight *= embed_norm

        self.model = FlashGemma3Model(
            prefix=prefix, config=config, weights=weights, causal=causal
        )
        self.lm_head = SpeculativeHead.load(
            prefix=(
                f"{prefix}.embed_tokens"
                if config.tie_word_embeddings
                else f"{prefix}.lm_head"
            ),
            config=config,
            weights=weights,
        )
        # self.softcap = config.attn_logit_softcapping
        # assert isinstance(self.softcap, float)
        self.softcap = None

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_embeds = self.embed_tokens(input_ids)

        hidden_states = self.model(
            input_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)

        return logits, speculative_logits


class Gemma3MultimodalInputProjection(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.mm_input_projection_weight = weights.get_tensor(
            "multi_modal_projector.mm_input_projection_weight"
        )

        self.mm_soft_emb_norm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.mm_soft_emb_norm",
            weights=weights,
            eps=config.vision_config.layer_norm_eps,
        )

        self.patches_per_image = int(
            config.vision_config.image_size // config.vision_config.patch_size
        )
        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
        self.kernel_size = self.patches_per_image // self.tokens_per_side
        self.avg_pool = nn.AvgPool2d(
            kernel_size=self.kernel_size, stride=self.kernel_size
        )

    def forward(self, vision_outputs: torch.Tensor):
        batch_size, _, seq_length = vision_outputs.shape

        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
            batch_size, seq_length, self.patches_per_image, self.patches_per_image
        )
        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()

        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)

        normed_vision_outputs, _ = self.mm_soft_emb_norm(pooled_vision_outputs)

        projected_vision_outputs = torch.matmul(
            normed_vision_outputs, self.mm_input_projection_weight
        )
        return projected_vision_outputs.type_as(vision_outputs)


class Gemma3ForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.config = config

        if config.vision_config is not None:

            config.vision_config.quantize = config.quantize

            self.post_vision_model_layernorm = nn.LayerNorm.load(
                prefix="vision_tower.vision_model.post_layernorm",
                weights=weights,
                eps=config.vision_config.layer_norm_eps,
            )

            self.multimodal_projector = Gemma3MultimodalInputProjection(
                prefix="multi_modal_projector",
                config=config,
                weights=weights,
            )

            text_config = config.text_config
            text_config.speculator = config.speculator
            text_config.quantize = config.quantize

            self.vision_model = load_vision_model(
                prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
                config=config.vision_config,
                weights=weights,
            )

            self.text_model = load_text_model(
                prefix="language_model" if not prefix else f"{prefix}.language_model",
                config=config.text_config,
                weights=weights,
            )
        else:
            config.text_config.quantize = config.quantize
            config.text_config.speculator = config.speculator
            self.text_model = load_text_model(
                prefix=prefix,
                config=config.text_config,
                weights=weights,
            )

        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )
        self.dtype = weights.dtype

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        pixel_values = pixel_values.to(dtype=self.dtype)
        image_outputs = self.vision_model(pixel_values)
        vision_outputs = self.post_vision_model_layernorm(
            image_outputs.last_hidden_state
        )
        image_features = self.multimodal_projector(vision_outputs)
        image_features = image_features.view(-1, image_features.shape[-1])
        return image_features

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            # Replace the image token embeddings with the vision features
            image_token_mask = (input_ids == self.config.image_token_index).to(
                input_ids.device
            )
            inputs_embeds[image_token_mask] = vision_embeds.view(
                -1, vision_embeds.shape[-1]
            )
        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        if cu_seqlen_prefill is not None:
            position_ids += 1

        if attention_mask is not None:
            min_dtype = torch.finfo(inputs_embeds.dtype).min
            # prefill may be larger than sliding window
            effective_seq_len = max(
                position_ids.shape[0], self.config.text_config.sliding_window
            )
            sliding_window_mask = torch.tril(
                torch.ones_like(attention_mask, dtype=torch.bool),
                diagonal=-self.config.text_config.sliding_window,
            )
            attention_mask_local = torch.where(
                sliding_window_mask, min_dtype, attention_mask
            )
            offset = max(0, position_ids.shape[0] - effective_seq_len)
            attention_mask_local = attention_mask_local[
                :, :, :, offset : offset + effective_seq_len
            ]
        else:
            attention_mask_local = None

        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            slots=slots,
            seqlen=seqlen,
            hpu_attention_meta=hpu_attention_meta,
            adapter_data=adapter_data,
        )

        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)

        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
from text_generation_server.utils.weights import UnquantizedWeight
import habana_frameworks.torch as htorch


class GemmaConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=256128,
        hidden_size=3072,
        intermediate_size=24576,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        head_dim=256,
        hidden_act="gelu_pytorch_tanh",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class GemmaFastRMSNorm(FastRMSNorm):
    @classmethod
    def load(cls, prefix: str, weights, eps=1e-6):
        dtype = weights.dtype
        weights.dtype = torch.float32
        weight = weights.get_tensor(f"{prefix}.weight") + 1
        weights.dtype = dtype
        new = cls(weight, eps)
        new.dtype = dtype
        return new

    # perform the multiplication in full precision and downcast after
    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states * self.weight
        return hidden_states.to(self.dtype), residual


def load_attention(config, prefix: str, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.head_dim
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias=None))


class FlashGemmaAttention(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool, rotary_emb):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_size = config.head_dim
        self.causal = causal
        self.rotary_emb = rotary_emb
        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
                causal=self.causal,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class GemmaMLP(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])


class FlashGemmaLayer(nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool, rotary_emb):
        super().__init__()
        self.self_attn = FlashGemmaAttention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            causal=causal,
            rotary_emb=rotary_emb,
        )
        self.mlp = GemmaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = GemmaFastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = GemmaFastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        mlp_output = self.mlp(normed_attn_res_output)

        return mlp_output, attn_res


class FlashGemmaModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.head_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                FlashGemmaLayer(
                    prefix=f"{prefix}.layers.{layer_id}",
                    config=config,
                    weights=weights,
                    causal=causal,
                    rotary_emb=rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = GemmaFastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        adapter_data: Optional[torch.Tensor],
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )
        hidden_states = inputs_embeds

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashGemmaForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, *, causal: bool = True):
        super().__init__()

        embed_norm = config.hidden_size**0.5
        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.embed_tokens.weight *= embed_norm

        self.model = FlashGemmaModel(
            prefix=prefix, config=config, weights=weights, causal=causal
        )
        self.lm_head = SpeculativeHead.load(
            prefix=(
                f"{prefix}.embed_tokens"
                if config.tie_word_embeddings
                else f"{prefix}.lm_head"
            ),
            config=config,
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_embeds = self.embed_tokens(input_ids)
        hidden_states = self.model(
            input_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
import habana_frameworks.torch as htorch


def load_qkv(config, prefix: str, weights, head_size, num_heads):
    if config.quantize == "gptq":
        return _load_qkv_gptq(
            config,
            prefix,
            weights,
        )
    else:
        return _load_qkv(config, prefix, weights, head_size, num_heads)


def _load_qkv_gptq(config, prefix: str, weights):
    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    # Weights
    weight = weights.get_weights_col_packed_qkv(
        f"{prefix}.c_attn",
        config.num_attention_heads,
        config.num_attention_heads,
    )

    # Bias
    slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
    shape = slice_.get_shape()
    total_size = shape[0]
    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
    single_size = total_size // 3
    assert single_size % world_size == 0
    block_size = single_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size
    tensors = []
    for i in range(3):
        tensor = slice_[start + i * single_size : stop + i * single_size]
        tensors.append(tensor)
    bias = torch.cat(tensors, dim=0)
    bias = bias.to(device=weights.device)

    return TensorParallelColumnLinear(get_linear(weight, bias))


def _load_qkv(config, prefix: str, weights, head_size, num_heads):
    """Load QKV from a single, transposed matrix."""

    slice_ = weights._get_slice(f"{prefix}.c_attn.weight")
    shape = slice_.get_shape()
    total_size = shape[1]
    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
    world_size = weights.process_group.size()
    single_size = total_size // 3
    assert single_size % world_size == 0
    rank = weights.process_group.rank()

    # Weights
    block_size = single_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size
    tensors = []
    for i in range(3):
        tensor = slice_[:, start + i * single_size : stop + i * single_size]
        tensors.append(tensor)
    weight = torch.cat(tensors, dim=1).T
    weight = weight.to(dtype=weights.dtype)
    weight = weight.to(device=weights.device)

    # Bias
    slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
    shape = slice_.get_shape()
    total_size = shape[0]
    single_size = total_size // 3
    block_size = single_size // world_size
    assert single_size % world_size == 0
    start = rank * block_size
    stop = (rank + 1) * block_size
    b = []
    for i in range(3):
        tensor = slice_[start + i * single_size : stop + i * single_size]
        b.append(tensor)
    bias = torch.cat(b, dim=0)
    bias = bias.to(dtype=weights.dtype)
    bias = bias.to(device=weights.device)
    assert list(bias.shape) == [
        3 * num_heads * head_size
    ], f"{weight.shape} != {[3 * num_heads * head_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias))


def load_row(config, prefix: str, weights, bias: bool):
    """load_row, but with transposed weight matrices."""

    if config.quantize == "gptq":
        weight = weights.get_weights_row(prefix)
    else:
        weight = weights.get_sharded(f"{prefix}.weight", dim=0).T

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None

    return TensorParallelRowLinear(
        get_linear(weight, bias), process_group=weights.process_group
    )


def load_col(config, prefix: str, weights, bias: bool):
    """load_col, but with transposed weight matrices."""
    if config.quantize == "gptq":
        weight = weights.get_multi_weights_col([prefix], dim=1)
    else:
        weight = weights.get_sharded(f"{prefix}.weight", dim=1).T

    if bias:
        bias = weights.get_sharded(f"{prefix}.bias", dim=0)
    else:
        bias = None

    return TensorParallelColumnLinear(get_linear(weight, bias))


class FlashGPT2Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size

        self.head_size = self.hidden_size // self.num_heads
        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.query_key_value = load_qkv(
            config,
            prefix=prefix,
            weights=weights,
            head_size=self.head_size,
            num_heads=self.num_heads,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = load_row(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=True,
        )

        self.kv_head_mapping = torch.arange(
            0, self.num_heads, dtype=torch.int32, device=weights.device
        )

    def forward(
        self,
        hidden_states,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        query, key, value = self.query_key_value(hidden_states).split(
            self.head_size * self.num_heads, dim=1
        )
        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_heads, self.head_size)
        value = value.view(-1, self.num_heads, self.head_size)

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class GPT2MLP(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        act = config.activation_function
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        self.c_fc = load_col(
            config, prefix=f"{prefix}.c_fc", weights=weights, bias=True
        )
        self.c_proj = load_row(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=True,
        )

        intermediate_size = (
            config.n_inner if config.n_inner is not None else 4 * config.hidden_size
        )

        self.intermediate_size = intermediate_size // weights.process_group.size()

    def forward(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        return self.c_proj(hidden_states)


class FlashGPT2Layer(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.self_attn = FlashGPT2Attention(
            prefix=f"{prefix}.attn", config=config, weights=weights
        )
        self.mlp = GPT2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
        )
        self.post_attention_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.ln_2",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        attn_output = self.self_attn(
            hidden_states,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        hidden_states = attn_output + residual
        residual = hidden_states

        hidden_states = self.post_attention_layernorm(hidden_states)

        mlp_output = self.mlp(hidden_states)

        return residual + mlp_output, residual


class FlashGPT2Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.layers = nn.ModuleList(
            [
                FlashGPT2Layer(
                    prefix=(
                        f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}"
                    ),
                    config=config,
                    weights=weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )

        self.norm = nn.LayerNorm.load(
            prefix="ln_f" if not prefix else f"{prefix}.ln_f",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )
        hidden_states = inputs_embeds

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()

        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states = self.norm(hidden_states)

        return hidden_states


class FlashGPT2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=("wte" if not prefix else f"{prefix}.wte"),
            weights=weights,
        )
        self.embed_positions = TensorParallelEmbedding(
            prefix=("wpe" if not prefix else f"{prefix}.wpe"),
            weights=weights,
        )

        self.model = FlashGPT2Model(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="wte" if not prefix else f"{prefix}.wte",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        token_embeds = self.embed_tokens(input_ids)
        position_embeds = self.embed_positions(position_ids)
        inputs_embeds = token_embeds + position_embeds
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta=hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)
from habana_frameworks.torch.hpex.kernels import (
    RotaryPosEmbeddingMode,
    apply_rotary_pos_emb,
)
import habana_frameworks.torch as htorch


def load_attention(config, prefix: str, weights):
    return TensorParallelColumnLinear.load_multi(
        config,
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
        weights=weights,
        bias=False,
    )


def load_row(config, prefix: str, weights, bias: bool):
    weight = weights.get_weights_row(prefix)

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None

    linear = get_linear(weight, bias)
    return TensorParallelRowLinear(linear, process_group=weights.process_group)


class GPTJRotary(PositionRotaryEmbedding):
    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        num_tokens = query.shape[0]
        head_size = query.shape[-1]
        rope_mode = RotaryPosEmbeddingMode.PAIRWISE
        sin = torch.repeat_interleave(sin, 2, dim=-1)
        cos = torch.repeat_interleave(cos, 2, dim=-1)
        rotary_dim = cos.shape[-1]
        query_shape = query.shape
        query = query.view(num_tokens, -1, head_size)
        query_rot = query[..., :rotary_dim]
        query_pass = query[..., rotary_dim:]
        query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, rope_mode)
        query.copy_(torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape))

        key_shape = key.shape
        key = key.view(num_tokens, -1, head_size)
        key_rot = key[..., :rotary_dim]
        key_pass = key[..., rotary_dim:]
        key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode)
        key.copy_(torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape))


class FlashGPTJAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size

        self.head_size = self.hidden_size // self.num_heads
        self.softmax_scale = self.head_size**-0.5
        self.rotary_dim = config.rotary_dim

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.query_key_value = load_attention(
            config,
            prefix=prefix,
            weights=weights,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = load_row(
            config,
            prefix=f"{prefix}.out_proj",
            weights=weights,
            bias=False,
        )

        self.kv_head_mapping = torch.arange(
            0, self.num_heads, dtype=torch.int32, device=weights.device
        )
        self.rotary_emb = rotary_emb

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        query, key, value = self.query_key_value(hidden_states).split(
            self.head_size * self.num_heads, dim=1
        )
        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_heads, self.head_size)
        value = value.view(-1, self.num_heads, self.head_size)

        # Compute rotary embeddings on rotary_ndims
        if self.rotary_dim is not None:
            self.rotary_emb(
                query[..., : self.rotary_dim], key[..., : self.rotary_dim], cos, sin
            )
        else:
            self.rotary_emb(query, key, cos, sin)

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class GPTJMLP(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        act = config.activation_function
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        self.fc_in = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.fc_in", weights=weights, bias=True
        )

        self.fc_out = load_row(
            config,
            prefix=f"{prefix}.fc_out",
            weights=weights,
            bias=True,
        )

    def forward(self, hidden_states):
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        return self.fc_out(hidden_states)


class FlashGPTJLayer(nn.Module):
    def __init__(self, prefix: str, config, weights, rotary_emb):
        super().__init__()
        self.self_attn = FlashGPTJAttention(
            prefix=f"{prefix}.attn",
            config=config,
            weights=weights,
            rotary_emb=rotary_emb,
        )
        self.mlp = GPTJMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        hidden_states, residual = self.input_layernorm(hidden_states, residual)
        # Self Attention
        attn_output = self.self_attn(
            hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        feed_forward_hidden_states = self.mlp(hidden_states)

        return attn_output + feed_forward_hidden_states, residual


class FlashGPTJModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.config = config

        self.wte = TensorParallelEmbedding(prefix=f"{prefix}.wte", weights=weights)
        rotary_emb = GPTJRotary.static(
            config=config,
            dim=config.rotary_dim,
            base=10000,
            device=weights.device,
        )
        self.layers = nn.ModuleList(
            [
                FlashGPTJLayer(
                    prefix=(
                        f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}"
                    ),
                    config=config,
                    weights=weights,
                    rotary_emb=rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )

        self.ln_f = FastLayerNorm.load(
            prefix="ln_f" if not prefix else f"{prefix}.ln_f",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads

    def forward(
        self,
        input_ids: Optional[torch.LongTensor],
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.wte(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.ln_f(hidden_states, residual)

        return hidden_states


class FlashGPTJForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"
        self.model = FlashGPTJModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta=hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama4_modeling.py
================================================
# coding=utf-8
# Copyright 2025 The LLAMA4 and HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Union

import torch
import math
import torch.utils.checkpoint
from torch import nn
import torch.nn.functional as F

import habana_frameworks.torch as htorch
from transformers.cache_utils import Cache
from transformers.activations import ACT2FN
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
from transformers.modeling_outputs import (
    BaseModelOutput,
)

from transformers.modeling_attn_mask_utils import AttentionMaskConverter

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    SpeculativeHead,
    FastLinear,
)
from text_generation_server.layers.layernorm import FastRMSNorm
from text_generation_server.layers.attention import (
    KVCache,
    paged_attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.models.custom_modeling.flash_llama_modeling import (
    FlashLlamaAttention,
)


def reshape_for_broadcast(freqs: torch.Tensor, target):
    ndim = len(target)
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(target)]
    return freqs.view(*shape)


def apply_rotary_emb(
    query: torch.Tensor,
    key: torch.Tensor,
    freqs_ci: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    query_shape = query.shape
    key_shape = key.shape
    cos_emb, sin_emb = freqs_ci.split(1, dim=-1)

    if len(query.shape) == 3:
        query = query.unsqueeze(0)
        key = key.unsqueeze(0)

    query_reshaped = query.float().reshape(*query.shape[:-1], -1, 2)
    key_reshaped = key.float().reshape(*key.shape[:-1], -1, 2)
    q_shape = query_reshaped.shape[:-1]
    cos_emb = reshape_for_broadcast(cos_emb, q_shape)
    sin_emb = reshape_for_broadcast(sin_emb, q_shape)
    x_q, y_q = query_reshaped.unbind(-1)
    x_k, y_k = key_reshaped.unbind(-1)

    x_q_rot = x_q * cos_emb - y_q * sin_emb
    y_q_rot = x_q * sin_emb + y_q * cos_emb
    x_k_rot = x_k * cos_emb - y_k * sin_emb
    y_k_rot = x_k * sin_emb + y_k * cos_emb

    query_out = torch.stack([x_q_rot, y_q_rot], dim=-1).flatten(-2)
    key_out = torch.stack([x_k_rot, y_k_rot], dim=-1).flatten(-2)
    query_out = query_out.view(*query_shape)
    key_out = key_out.view(*key_shape)
    return query_out.type_as(query), key_out.type_as(key)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class Llama4TextExperts(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.process_group = weights.process_group
        self.num_experts = config.num_local_experts
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )
        self.hidden_size = config.hidden_size
        self.expert_dim = self.intermediate_size
        self.gate_up_proj = nn.Parameter(
            weights.get_packed_sharded(f"{prefix}.gate_up_proj", dim=-1, block_sizes=2),
            requires_grad=False,
        )
        self.down_proj = nn.Parameter(
            weights.get_sharded(f"{prefix}.down_proj", dim=1), requires_grad=False
        )
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        """
        gate_up_proj = self.gate_up_proj.view(self.num_experts, -1, 2 * self.expert_dim)
        down_proj = self.down_proj.view(self.num_experts, self.expert_dim, -1)
        hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
        gate_up = torch.bmm(hidden_states, gate_up_proj)
        gate, up = gate_up.chunk(2, dim=-1)  # not supported for DTensors
        next_states = torch.bmm((up * self.act_fn(gate)), down_proj)
        next_states = next_states.view(-1, self.hidden_size)

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(next_states, group=self.process_group)

        return next_states


# Phi3MLP
class Llama4TextMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config=config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config=config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )

        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        gate_up_states = self.gate_up_proj(x)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(self.act_fn(gate_up_states[:, 0]) * gate_up_states[:, 1])


class Llama4TextL2Norm(torch.nn.Module):
    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.eps = eps

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        return self._norm(x.float()).type_as(x)

    def extra_repr(self):
        return f"eps={self.eps}"


class Llama4TextMoe(nn.Module):
    def __init__(
        self,
        prefix,
        config,
        weights,
    ):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.hidden_dim = config.hidden_size
        self.num_experts = config.num_local_experts
        self.experts = Llama4TextExperts(
            config=config, prefix=f"{prefix}.experts", weights=weights
        )
        self.router = FastLinear.load(
            config=config, prefix=f"{prefix}.router", weights=weights, bias=False
        )
        self.shared_expert = Llama4TextMLP(
            config=config, prefix=f"{prefix}.shared_expert", weights=weights
        )
        self.process_group = weights.process_group

    def forward(self, hidden_states, adapter_data):
        seq_len, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, self.hidden_dim)
        tokens_per_expert = hidden_states.shape[0]
        router_logits = self.router(hidden_states)

        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1)
        router_scores = (
            torch.full_like(router_logits, float("-inf"))
            .scatter_(1, router_indices, router_top_value)
            .transpose(0, 1)
        )
        # We do this to make sure we have -inf for non topK tokens before going through the !
        # Here we are just creating a tensor to index each and every single one of the hidden states. Let s maybe register a buffer for this!
        router_indices = (
            torch.arange(tokens_per_expert, device=hidden_states.device)
            .view(1, -1)
            .expand(router_scores.size(0), -1)
        )
        router_scores = torch.sigmoid(router_scores.float()).to(hidden_states.dtype)

        router_indices = router_indices.reshape(-1, 1).expand(-1, self.hidden_dim)
        routed_in = torch.gather(
            input=hidden_states,
            dim=0,
            index=router_indices,
        ).to(hidden_states.device)

        # we gather inputs corresponding to each expert based on the router indices
        routed_in = routed_in * router_scores.reshape(-1, 1)
        routed_out = self.experts(routed_in)
        out = self.shared_expert(hidden_states)

        # now that we finished expert computation -> we scatter add because we gathered previously
        # we have to do this because we used all experts on all tokens. This is faster than the for loop, tho you are compute bound
        # this scales a lot better if you do EP!
        out.scatter_add_(
            dim=0, index=router_indices, src=routed_out.view(-1, self.hidden_dim)
        )
        return out


class Llama4TextRotaryEmbedding(nn.Module):
    def __init__(self, config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        self.rope_type = "llama3" if config.rope_scaling is not None else "default"

        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    def forward(self, x, position_ids):
        inv_freq_expanded = (
            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        )
        position_ids_expanded = position_ids[:, None, :].float()
        device_type = (
            x.device.type
            if isinstance(x.device.type, str) and x.device.type != "mps"
            else "cpu"
        )
        inv_freq_expanded = inv_freq_expanded.to(device_type)
        position_ids_expanded = position_ids_expanded.to(device_type)
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
            freqs_cis = (
                torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
                * self.attention_scaling
            )
        return freqs_cis.to(dtype=x.dtype, device=x.device)


class Llama4TextAttention(FlashLlamaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, prefix, config, weights, layer_idx):
        super().__init__(layer_idx, prefix, config, weights, None)
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        self.scaling = self.head_dim**-0.5
        self.attn_scale = config.attn_scale
        self.floor_scale = config.floor_scale
        self.attn_temperature_tuning = config.attn_temperature_tuning
        self.attention_dropout = config.attention_dropout
        self.use_rope = int((layer_idx + 1) % 4 != 0)  # rope unused for dense layers

        if self.config.use_qk_norm and self.use_rope:
            self.qk_norm = Llama4TextL2Norm(config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        freqs_ci,
        cu_seqlen_prefill,
        kv_cache: KVCache,
        slots,
        seqlen,
        adapter_data,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bs = seqlen.input_lengths.shape[0]
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
        qkv = self.query_key_value(hidden_states, adapter_data)
        query_states, key_states, value_states = qkv.split(
            [
                self.head_dim * self.num_heads,
                self.head_dim * self.num_key_value_heads,
                self.head_dim * self.num_key_value_heads,
            ],
            dim=-1,
        )

        query_states = query_states.view(hidden_shape)
        key_states = key_states.view(hidden_shape)
        value_states = value_states.view(hidden_shape)

        if self.use_rope:  # the 16E model skips rope for long context on certain layers
            query_states, key_states = apply_rotary_emb(
                query_states, key_states, freqs_ci
            )

        if hasattr(self, "qk_norm"):  # the 128E model does not use qk_norm
            query_states = self.qk_norm(query_states)
            key_states = self.qk_norm(key_states)

        kv_cache.store(
            key=key_states,
            value=value_states,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Use temperature tuning from https://arxiv.org/abs/2501.19399) to NoROPE layers
        if self.attn_temperature_tuning and not self.use_rope:
            attn_scales = (
                torch.log(
                    torch.floor((position_ids.float() + 1.0) / self.floor_scale) + 1.0
                )
                * self.attn_scale
                + 1.0
            )
            attn_scales = attn_scales.view(*input_shape, 1, 1)
            query_states = (query_states * attn_scales).to(query_states.dtype)

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            query = query_states.view(bs, -1, self.num_heads, self.head_dim).transpose(
                1, 2
            )
            key = key_states.view(
                bs, -1, self.num_key_value_heads, self.head_dim
            ).transpose(1, 2)
            value = value_states.view(
                bs, -1, self.num_key_value_heads, self.head_dim
            ).transpose(1, 2)
            key = repeat_kv(key, self.num_key_value_groups)
            value = repeat_kv(value, self.num_key_value_groups)

            causal_mask = attention_mask
            if attention_mask is not None and causal_mask.ndim == 4:
                causal_mask = causal_mask[:, :, :, : key.shape[-2]]
            is_causal = query.shape[2] > 1 and causal_mask is None
            # SDPA with memory-efficient backend is bugged with non-contiguous inputs and custom attn_mask for some torch versions
            # Reference: https://github.com/pytorch/pytorch/issues/112577.
            query = query.contiguous()
            key = key.contiguous()
            value = value.contiguous()

            attn_output = torch.nn.functional.scaled_dot_product_attention(
                query,
                key,
                value,
                attn_mask=causal_mask,
                dropout_p=0,
                scale=self.scaling,
                is_causal=is_causal,
            )
            attn_output = attn_output.transpose(1, 2).contiguous()
        # Decode
        else:
            attn_output = paged_attention(
                query_states,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output, adapter_data)
        return attn_output


class Llama4TextDecoderLayer(nn.Module):
    def __init__(self, prefix, config, weights, layer_idx):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Llama4TextAttention(
            f"{prefix}.self_attn", config, weights, layer_idx
        )
        self.use_chunked_attention = int((layer_idx + 1) % 4 != 0)  # <=> use rope
        self.is_moe_layer = layer_idx in config.moe_layers
        if self.is_moe_layer:  # the 128E model interleaves dense / sparse
            self.feed_forward = Llama4TextMoe(f"{prefix}.feed_forward", config, weights)
        else:
            self.feed_forward = Llama4TextMLP(f"{prefix}.feed_forward", config, weights)

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        freqs_ci,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        attention_mask: Optional[torch.Tensor] = None,
        chunk_causal_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata] = None,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        residual = hidden_states
        hidden_states, _ = self.input_layernorm(hidden_states)

        # use local attention mask for ROPE layers
        if self.use_chunked_attention and chunk_causal_mask is not None:
            attention_mask = chunk_causal_mask

        attention_states = self.self_attn(
            hidden_states,
            freqs_ci,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            attention_mask=attention_mask,
            position_ids=position_ids,
            hpu_attention_meta=hpu_attention_meta,
        )

        hidden_states = residual + attention_states

        # Fully Connected
        residual = hidden_states

        hidden_states, _ = self.post_attention_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states, adapter_data)
        hidden_states = residual + hidden_states.view(residual.shape)
        return hidden_states


class Llama4TextModel(nn.Module):

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.layers = nn.ModuleList(
            [
                Llama4TextDecoderLayer(
                    prefix=f"{prefix}.layers.{layer_idx}",
                    config=config,
                    weights=weights,
                    layer_idx=layer_idx,
                )
                for layer_idx in range(config.num_hidden_layers)
            ]
        )

        # self.norm = Llama4TextRMSNorm(prefix=f"{prefix}.norm", config=config, weights=weights)
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

        self.rotary_emb = Llama4TextRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        adapter_data,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )

        hidden_states = inputs_embeds
        bs = seqlen.input_lengths.shape[0]
        seq_len = inputs_embeds.shape[0] / bs
        cache_position = torch.arange(0, seq_len, device=inputs_embeds.device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask, chunk_causal_mask = self._update_causal_mask(
            attention_mask,
            inputs_embeds.view(bs, int(seq_len), -1),
            cache_position,
            None,
            output_attentions=False,
            use_cache=False,
        )

        freqs_ci = self.rotary_emb(hidden_states, position_ids.view(bs, -1))
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()

        for i, layer in enumerate(self.layers):
            hidden_states = layer(
                hidden_states,
                freqs_ci,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                adapter_data,
                attention_mask=causal_mask,
                chunk_causal_mask=chunk_causal_mask,
                position_ids=position_ids,
                hpu_attention_meta=hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states)

        return hidden_states

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
        chunked_attention_mask=None,
        use_cache=True,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return (
                    attention_mask,
                    attention_mask,
                )  # flash does not support chunked attn TODO support flash
            return None, None

        if self.config._attn_implementation not in ["sdpa", "flex_attention", "eager"]:
            return None, None

        sequence_length = input_tensor.shape[1]
        attention_chunk_size = self.config.attention_chunk_size

        first_cache_position = cache_position[0]

        if past_key_values is not None:
            full_cache_length = past_key_values.get_max_cache_shape() or sequence_length
        else:
            full_cache_length = (
                attention_mask.shape[-1]
                if attention_mask is not None
                else sequence_length
            )

        cond1 = first_cache_position >= attention_chunk_size
        cond2 = (first_cache_position < attention_chunk_size) & (
            first_cache_position + sequence_length > attention_chunk_size
        )
        key_length = (
            torch.where(
                cond1,
                attention_chunk_size + sequence_length - 1,
                torch.where(
                    cond2, first_cache_position + sequence_length, attention_chunk_size
                ),
            )
            if use_cache
            else full_cache_length
        )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        dtype, device = input_tensor.dtype, input_tensor.device
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=max(full_cache_length, attention_chunk_size),
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            device=device,
        )
        if full_cache_length > self.config.attention_chunk_size:
            start_idx = max(first_cache_position - attention_chunk_size + 1, 0)
            end_idx = start_idx + key_length
            chunked_attention_mask = self.create_chunked_attention_mask(
                self.config.attention_chunk_size,
                start=start_idx,  # same offset as with flex
                end=end_idx,
                device=device,
            )

            local_attention_mask = attention_mask[
                :, start_idx:end_idx
            ]  # offset here as well
            # It may be smaller than attention_chunk_size -> pad it
            requires_padding = local_attention_mask.shape[-1] < attention_chunk_size
            if requires_padding:
                local_attention_mask = nn.functional.pad(
                    local_attention_mask,
                    (0, attention_chunk_size - local_attention_mask.shape[-1]),
                )
            # Depending on the padding, take the query tokens from the end or the cache_position
            if not requires_padding:
                chunked_attention_mask = chunked_attention_mask[
                    None, None, -sequence_length:, :
                ]
            else:
                chunked_attention_mask = chunked_attention_mask[
                    None, None, cache_position, :
                ]

            chunked_attention_mask = chunked_attention_mask.expand(
                input_tensor.shape[0], -1, -1, -1
            )
            chunked_attention_mask = (
                chunked_attention_mask * local_attention_mask[:, None, None, :]
            )
            if self.config._attn_implementation == "eager":
                min_dtype = torch.finfo(dtype).min
                chunked_attention_mask = torch.where(
                    chunked_attention_mask == 0, min_dtype, 0.0
                ).to(dtype)

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and attention_mask.ndim == 4
            and not output_attentions  # Only unmask for 4d masks
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(
                causal_mask, min_dtype
            )

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and chunked_attention_mask is not None
        ):
            chunked_attention_mask = chunked_attention_mask.bool()
            causal_mask = causal_mask.bool()
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=first_cache_position,
                is_training=self.training,
            ):
                causal_mask = None
        return causal_mask, chunked_attention_mask

    def create_chunked_attention_mask(
        self, attention_chunk_size: int, start: int, end: int, device: torch.device
    ) -> torch.Tensor:
        """
        Generate the following:

        'What'      :  0 ■ ⬚ ⬚ ⬚ ⬚ ⬚    |
        '▁is'       :  1 ■ ■ ⬚ ⬚ ⬚ ⬚     |
        '▁ch'       :  2 ■ ■ ■ ⬚ ⬚ ⬚     |
        'unked'     :  3 ⬚ ⬚ ⬚ ■ ⬚ ⬚    |
        '▁attention':  4 ⬚ ⬚ ⬚ ■ ■ ⬚    |
        '?'         :  5 ⬚ ⬚ ⬚ ■ ■ ■     |

        If the chunk size is 3.
        This can just be applied over the already created attention mask
        """
        arange_vector = torch.arange(start, end, device=device)
        block_pos = torch.abs(
            arange_vector.unsqueeze(0) // attention_chunk_size
            - arange_vector.unsqueeze(1) // attention_chunk_size
        )
        token_pos = arange_vector.unsqueeze(0) - arange_vector.unsqueeze(1)
        mask = (block_pos == 0) & (token_pos <= 0)
        return mask.to(device)

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length),
                fill_value=min_dtype,
                dtype=dtype,
                device=device,
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(
                target_length, device=device
            ) > cache_position.to(device).reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = (
                    causal_mask.clone()
                )  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[
                    :, None, None, :
                ].to(device)
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[
                    :, :, :, :mask_length
                ].masked_fill(padding_mask, min_dtype)

        return causal_mask


class Llama4ForCausalLM(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.model = Llama4TextModel(
            prefix=f"{prefix}.model", config=config, weights=weights
        )
        self.vocab_size = config.vocab_size
        self.lm_head = SpeculativeHead.load(
            config,
            f"{prefix}.lm_head",
            weights,
        )

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        adapter_data: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data=adapter_data,
            hpu_attention_meta=hpu_attention_meta,
            attention_mask=attention_mask,
        )

        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]

        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


class Llama4VisionMLP2(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.fc1 = TensorParallelColumnLinear.load(
            config=config, prefix=f"{prefix}.fc1", weights=weights, bias=False
        )
        self.fc2 = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.fc2", weights=weights, bias=False
        )
        self.activation_fn = nn.GELU()  # ACT2FN[config.hidden_act]
        self.dropout = config.projector_dropout

    def forward(self, hidden_states):
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        return self.activation_fn(
            hidden_states
        )  # TODO: check if we need to apply activation again


class Llama4MultiModalProjector(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.linear_1 = FastLinear.load(
            config=config, prefix=f"{prefix}.linear_1", weights=weights, bias=False
        )

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        return hidden_states


def pixel_shuffle(input_tensor, shuffle_ratio):
    # input_tensor: [batch_size, num_patches, channels]
    batch_size, num_patches, channels = input_tensor.shape
    patch_size = int(math.sqrt(num_patches))

    input_tensor = input_tensor.view(batch_size, patch_size, patch_size, -1)
    batch_size, height, width, channels = input_tensor.size()
    reshaped_tensor = input_tensor.view(
        batch_size, height, int(width * shuffle_ratio), int(channels / shuffle_ratio)
    )
    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
    reshaped_tensor = reshaped_tensor.view(
        batch_size,
        int(height * shuffle_ratio),
        int(width * shuffle_ratio),
        int(channels / (shuffle_ratio**2)),
    )
    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()

    output_tensor = reshaped_tensor.view(batch_size, -1, reshaped_tensor.shape[-1])
    return output_tensor


class Llama4VisionPixelShuffleMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
        self.inner_dim = int(
            config.projector_input_dim // (self.pixel_shuffle_ratio**2)
        )
        self.output_dim = config.projector_output_dim
        self.mlp = Llama4VisionMLP2(
            prefix=f"{prefix}.mlp", config=config, weights=weights
        )

    def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
        encoded_patches = pixel_shuffle(encoded_patches, self.pixel_shuffle_ratio)
        return self.mlp(encoded_patches)


# TODO there is a different RoPE for vision encoder, defined as below
def vision_reshape_for_broadcast(freqs_ci: torch.Tensor, query: torch.Tensor):
    ndim = query.ndim
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(query.shape)]
    return freqs_ci.view(*shape)


class Llama4VisionAttention(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads // weights.process_group.size()
        self.progress_group = weights.process_group

        self.head_dim = config.hidden_size // config.num_attention_heads
        self.num_key_value_groups = 1
        self.attention_dropout = config.attention_dropout
        self.qkv_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=True,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        freqs_ci: torch.Tensor,  # Now takes (cos_theta, sin_theta) instead of complex
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        qkv = self.qkv_proj(hidden_states)

        query_states, key_states, value_states = qkv.split(
            [
                self.head_dim * self.num_heads,
                self.head_dim * self.num_heads,
                self.head_dim * self.num_heads,
            ],
            dim=2,
        )
        query_states = query_states.view(hidden_shape)
        key_states = key_states.view(hidden_shape)
        value_states = value_states.view(hidden_shape)

        query_states, key_states = apply_rotary_emb(
            query_states, key_states, freqs_ci=freqs_ci
        )

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            is_causal=False,
            dropout_p=0,
        )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output


class Llama4VisionMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = nn.GELU()  # ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Llama4VisionEncoderLayer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Llama4VisionAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.mlp = Llama4VisionMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights
        )

        self.input_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=1e-05
        )
        self.post_attention_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=1e-05
        )

    def forward(
        self,
        hidden_state: torch.Tensor,
        freqs_ci: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        # Self Attention
        residual = hidden_state

        hidden_state = self.input_layernorm(hidden_state)

        hidden_state = self.self_attn(
            hidden_state,
            freqs_ci=freqs_ci,
            attention_mask=attention_mask,
        )

        hidden_state = residual + hidden_state

        # Feed forward
        residual = hidden_state
        hidden_state = self.post_attention_layernorm(hidden_state)
        hidden_state = self.mlp(hidden_state)
        hidden_state = residual + hidden_state
        outputs = (hidden_state,)
        return outputs


class Llama4VisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    """

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                Llama4VisionEncoderLayer(
                    prefix=f"{prefix}.layers.{layer_id}", config=config, weights=weights
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False
        self.config = config

    def forward(
        self,
        hidden_states: torch.Tensor,
        freqs_ci: torch.Tensor,  # TODO move this to an attribute instead of keeping it around
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, BaseModelOutput]:

        for encoder_layer in self.layers:
            layer_outputs = encoder_layer(
                hidden_state=hidden_states,
                attention_mask=attention_mask,
                freqs_ci=freqs_ci,
            )

            hidden_states = layer_outputs[0]

        return hidden_states


class Llama4UnfoldConvolution(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        kernel_size = config.patch_size
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size)
        self.linear = FastLinear.load(
            config=config, prefix=f"{prefix}.linear", weights=weights, bias=False
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.unfold(hidden_states)
        hidden_states = hidden_states.permute(0, 2, 1)
        hidden_states = self.linear(hidden_states)
        return hidden_states


class Llama4VisionRotaryEmbedding(nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        # Calculate image grid indices
        idx = config.image_size // config.patch_size
        img_idx = torch.arange(
            idx**2, dtype=torch.int32, device=weights.device
        ).reshape(idx**2, 1)
        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)

        img_idx[-1, -1] = -2  # ID_CLS_TOKEN
        # Calculate x and y coordinates
        frequencies_x = img_idx % idx  # x coordinates
        frequencies_y = torch.div(img_idx, idx, rounding_mode="floor")  # y coordinates
        # Calculate frequency components
        freq_dim = config.hidden_size // config.num_attention_heads // 2
        rope_freq = 1.0 / (
            config.rope_theta
            ** (
                torch.arange(0, freq_dim, 2, device=weights.device)[
                    : (freq_dim // 2)
                ].float()
                / freq_dim
            )
        )

        # Compute frequencies for x and y directions
        freqs_x = (frequencies_x + 1)[..., None] * rope_freq[None, None, :]
        freqs_x = freqs_x.repeat_interleave(2, dim=-1)
        freqs_y = (frequencies_y + 1)[..., None] * rope_freq[None, None, :]
        freqs_y = freqs_y.repeat_interleave(2, dim=-1)

        # Combine frequencies and mask special tokens
        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)

        freq_cis = torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
        self.freqs_ci = freq_cis  # idx**2, idx**2, idx * 2

    def forward(self, hidden_states):
        """
        Returns the rotary embedding components (cosθ, sinθ) for the given hidden states
        """
        return self.freqs_ci.to(dtype=hidden_states.dtype, device=hidden_states.device)


class Llama4VisionModel(nn.Module):

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.hidden_size = config.hidden_size
        self.num_channels = config.num_channels

        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
        self.scale = config.hidden_size**-0.5

        self.patch_embedding = Llama4UnfoldConvolution(
            prefix=f"{prefix}.patch_embedding", config=config, weights=weights
        )

        self.class_embedding = nn.Parameter(
            weights.get_tensor(f"{prefix}.class_embedding"), requires_grad=False
        )

        self.positional_embedding_vlm = nn.Parameter(
            weights.get_tensor(f"{prefix}.positional_embedding_vlm"),
            requires_grad=False,
        )

        self.rotary_embedding = Llama4VisionRotaryEmbedding(config, weights)

        # layer norms
        self.layernorm_pre = nn.LayerNorm.load(
            prefix=f"{prefix}.layernorm_pre", weights=weights, eps=config.norm_eps
        )
        self.layernorm_post = nn.LayerNorm.load(
            prefix=f"{prefix}.layernorm_post", weights=weights, eps=config.norm_eps
        )

        # encoders
        self.model = Llama4VisionEncoder(
            prefix=f"{prefix}.model", config=config, weights=weights
        )
        self.vision_adapter = Llama4VisionPixelShuffleMLP(
            prefix=f"{prefix}.vision_adapter", config=config, weights=weights
        )

    def forward(
        self,
        pixel_values: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ):
        # num_concurrent_media and num_chunks are both currently 1
        batch_size_times_num_tiles, num_channels, height, width = pixel_values.shape
        num_concurrent_media = 1
        num_chunks = 1
        hidden_state = self.patch_embedding(pixel_values)
        _, num_patches, hidden_dim = hidden_state.shape

        # Add cls token
        hidden_state = hidden_state.reshape(
            batch_size_times_num_tiles * num_concurrent_media * num_chunks,
            num_patches,
            hidden_dim,
        )
        class_embedding = self.class_embedding.expand(
            hidden_state.shape[0], 1, hidden_state.shape[-1]
        )
        hidden_state = torch.cat([hidden_state, class_embedding], dim=1)
        num_patches += 1

        # Position embeddings
        hidden_state = hidden_state.reshape(
            batch_size_times_num_tiles * num_concurrent_media,
            num_chunks,
            num_patches,
            hidden_dim,
        )
        positional_embedding = self.positional_embedding_vlm.to(
            dtype=hidden_state.dtype, device=hidden_state.device
        )
        hidden_state = hidden_state + positional_embedding
        hidden_state = self.layernorm_pre(hidden_state)
        hidden_state = hidden_state.view(batch_size_times_num_tiles, -1, hidden_dim)
        freqs_ci = self.rotary_embedding(pixel_values)

        hidden_state = self.model(
            hidden_state,
            attention_mask=None,
            freqs_ci=freqs_ci,
        )

        hidden_state = self.layernorm_post(hidden_state)

        hidden_state = hidden_state[:, :-1, :]

        # now, we use Llama4VisionPixelShuffle + mlp to project embeddings
        hidden_state = self.vision_adapter(hidden_state)
        return hidden_state


class Llama4ForConditionalGeneration(nn.Module):

    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.config = config
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator
        config.text_config._attn_implementation = None

        self.vision_model = Llama4VisionModel(
            prefix="vision_model", config=config.vision_config, weights=weights
        )

        self.multi_modal_projector = Llama4MultiModalProjector(
            prefix="multi_modal_projector", config=config, weights=weights
        )

        self.text_model = Llama4ForCausalLM(
            prefix="language_model", config=config.text_config, weights=weights
        )
        self.vocab_size = config.text_config.vocab_size
        self.pad_token_id = (
            self.config.pad_token_id if self.config.pad_token_id is not None else -1
        )
        self.config = config
        self.dtype = weights.dtype
        self.device = weights.device

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Union[int, List[int]],
        vision_feature_select_strategy: str,
        **kwargs,
    ):
        """
        Obtains image last hidden states from the vision tower and apply al projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, List[int]]`):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        """
        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError(
                f"Unexpected select feature strategy: {self.vision_feature_select_strategy}"
            )
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        hidden_state = self.vision_model(pixel_values)
        return hidden_state

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        image_features = self.get_image_features(
            pixel_values=pixel_values,
            vision_feature_layer=self.config.vision_config.vision_feature_layer,
            vision_feature_select_strategy=self.config.vision_config.vision_feature_select_strategy,
            image_sizes=image_sizes,
        )
        vision_flat = image_features.view(-1, image_features.size(-1))
        image_features = self.multi_modal_projector(vision_flat)
        return image_features

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
        pixel_values: torch.FloatTensor = None,
        image_sizes: Optional[torch.LongTensor] = None,
    ):
        inputs_embeds = self.text_model.model.embed_tokens(input_ids)

        if vision_embeds is not None:
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            original_inputs_embeds_shape = inputs_embeds.shape
            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(
                -1
            )
            final_mask = special_image_mask.to(inputs_embeds.device)
            inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1))

            final_mask_1d = final_mask[..., 0].reshape(-1)
            num_tokens_to_fill = final_mask_1d.sum()

            if num_tokens_to_fill != vision_embeds.size(0):
                raise ValueError(
                    f"Mismatch: final_mask wants {num_tokens_to_fill} embeddings, "
                    f"but multi_modal_projector returned {vision_embeds.size(0)}"
                )

            expanded_mask = final_mask_1d.unsqueeze(-1).expand(
                -1, inputs_embeds.size(-1)
            )
            inputs_embeds = inputs_embeds.masked_scatter(expanded_mask, vision_embeds)
            inputs_embeds = inputs_embeds.view(original_inputs_embeds_shape)
        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: Optional[torch.LongTensor] = None,
        cu_seqlen_prefill: Optional[torch.Tensor] = None,
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = None,
        slots: torch.Tensor = None,
        seqlen: Seqlen = None,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
        **lm_kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:

        logits, speculative_logits = self.text_model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
            adapter_data,
            lm_head_indices,
            attention_mask,
        )

        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from contextlib import contextmanager
from typing import List, Optional, Tuple, Type

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
import habana_frameworks.torch as htorch
from text_generation_server.layers.attention import (
    KVCache,
    get_kv_scales,
)
from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
    FastLayerNorm,
)
from text_generation_server.layers import (
    FastLinear,
)
from text_generation_server.utils.weights import (
    Weights,
)
from text_generation_server.layers.fp8 import HybridFP8UnquantLoader


def load_attention(config, prefix: str, weights, layer_id):
    # Only defined in granite.
    bias = getattr(config, "attention_bias", False)
    head_size = config.hidden_size // config.num_attention_heads
    sizes = None
    prefixes = None

    if config.model_type == "phi3":
        base_layer = TensorParallelColumnLinear.load_qkv(
            config,
            prefix=f"{prefix}.qkv_proj",
            weights=weights,
            bias=bias,
            num_heads=config.num_attention_heads,
            num_key_value_heads=config.num_key_value_heads,
        )
        prefixes = ["qkv_proj"]
    elif config.model_type == "baichuan":
        prefix = f"{prefix}.W_pack"
        base_layer = TensorParallelColumnLinear.load_qkv(
            config,
            prefix=prefix,
            weights=weights,
            bias=bias,
            num_heads=config.num_attention_heads,
            num_key_value_heads=config.num_key_value_heads,
        )
        prefixes = [prefix]
    else:
        prefixes = ["q_proj", "k_proj", "v_proj"]
        sizes = [
            head_size * config.num_attention_heads,
            head_size * config.num_key_value_heads,
            head_size * config.num_key_value_heads,
        ]
        base_layer = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=bias,
        )

    return TensorParallelMultiAdapterLinear.load(
        base_layer=base_layer,
        layer_id=layer_id,
        layer_names=prefixes,
        sizes=sizes,
        process_group=weights.process_group,
    )


@contextmanager
def no_fp8(weights: Weights):
    """De-activate fp8 auto conversion for the duration of this context manager"""
    weights_loader = weights.weights_loader
    if isinstance(weights_loader, HybridFP8UnquantLoader) and weights_loader.to_fp8:
        weights_loader = HybridFP8UnquantLoader(
            weights_loader.activation_scale_ub, to_fp8=False
        )

    with weights.use_loader(weights_loader):
        yield


class FlashLlamaAttention(torch.nn.Module):
    def __init__(
        self,
        index: int,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = rotary_emb

        # `config.attention_multiplier` is used in Granite
        self.softmax_scale = getattr(
            config, "attention_multiplier", self.head_size**-0.5
        )

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        if config.num_key_value_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_key_value_heads` must be divisible by `num_shards` (got `num_key_value_heads`: {config.num_key_value_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights, index)
        self.index = index

        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=getattr(config, "attention_bias", False),
        )

        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            index,
            "o_proj",
            process_group=weights.process_group,
        )

        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache: KVCache,
        slots,
        seqlen,
        adapter_data,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ):
        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_scales=self.kv_scales,
                kv_cache=kv_cache,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Phi3MoE(nn.Module):
    def __init__(
        self, prefix: str, config, moe_layer_cls: Type[MoELayer], weights: Weights
    ):
        super().__init__()

        # gating
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)

        self.moe = moe_layer_cls(
            prefix=f"{prefix}.experts",
            n_experts=config.num_local_experts,
            n_expert_group=None,
            renormalize=True,
            topk=config.num_experts_per_tok,
            topk_group=None,
            weights=weights,
            gate_proj_name="w1",
            up_proj_name="w3",
            down_proj_name="w2",
        )

        self.process_group = weights.process_group

    def forward(self, x, adapter_data) -> torch.Tensor:
        # router_logits: (num_tokens, n_experts)
        router_logits = self.gate(x)
        out = self.moe(x, gating_output=router_logits)

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class LlamaMLP(nn.Module):
    def __init__(self, prefix, config, weights, index):
        super().__init__()
        self.hidden_act = config.hidden_act
        self.act = (
            ACT2FN[self.hidden_act]
            if "gelu" not in self.hidden_act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh"
                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
                    else "none"
                ),
            )
        )
        prefixes = None
        sizes = None

        # Fuse gate and up proj
        bias = getattr(config, "mlp_bias", False)
        if config.model_type == "phi3":
            gate_up_proj = TensorParallelColumnLinear.load_gate_up(
                config,
                prefix=f"{prefix}.gate_up_proj",
                weights=weights,
                bias=bias,
            )
        else:
            prefixes = ["gate_proj", "up_proj"]
            sizes = [
                config.intermediate_size,
                config.intermediate_size,
            ]
            gate_up_proj = TensorParallelColumnLinear.load_multi(
                config,
                prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
                weights=weights,
                dim=0,
                bias=bias,
            )

        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            index,
            layer_names=prefixes,
            sizes=sizes,
            process_group=weights.process_group,
        )

        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=bias,
        )

        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            index,
            "down_proj",
            process_group=weights.process_group,
        )

        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize

        self.hidden_size = config.hidden_size

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


class FlashLlamaLayer(nn.Module):
    def __init__(self, index, prefix, config, weights, rotary_emb):
        super().__init__()

        with no_fp8(weights):
            self.self_attn = FlashLlamaAttention(
                index=index,
                prefix=f"{prefix}.self_attn",
                config=config,
                weights=weights,
                rotary_emb=rotary_emb,
            )

        if config.model_type == "phimoe":
            moe_layer_cls = (
                SparseMoELayer
                if SparseMoELayer.is_supported(weights)
                else DenseMoELayer
            )
            self.mlp = Phi3MoE(
                f"{prefix}.block_sparse_moe", config, moe_layer_cls, weights
            )
            # with moe the layernorms are are not rmsnorms and they have bias
            self.input_layernorm = FastLayerNorm.load(
                prefix=f"{prefix}.input_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
            self.post_attention_layernorm = FastLayerNorm.load(
                prefix=f"{prefix}.post_attention_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
        else:
            self.mlp = LlamaMLP(
                prefix=f"{prefix}.mlp", config=config, weights=weights, index=index
            )
            self.input_layernorm = FastRMSNorm.load(
                prefix=f"{prefix}.input_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
            self.post_attention_layernorm = FastRMSNorm.load(
                prefix=f"{prefix}.post_attention_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )

        # Used in Granite
        # This could eventually be baked into the weights like we do for the embeddings/lm_head
        # but this would mean modifying the lora code
        self.residual_multiplier = getattr(config, "residual_multiplier", None)

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        cross_attention_states,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            hpu_attention_meta=hpu_attention_meta,
        )
        if self.residual_multiplier is not None:
            attn_output *= self.residual_multiplier

        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        mlp_output = self.mlp(normed_attn_res_output, adapter_data)
        if self.residual_multiplier is not None:
            mlp_output *= self.residual_multiplier

        return mlp_output, attn_res


class FlashLlamaModel(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()

        # Skip fp8 quant for first and last layers
        self.layers = nn.ModuleList()
        self.cross_attention_layers = getattr(config, "cross_attention_layers", [])
        # Setting defaults for baichuan custom config which doesn't apply them.
        config.rope_theta = getattr(config, "rope_theta", 10000)
        config.num_key_value_heads = getattr(
            config, "num_key_value_heads", config.num_attention_heads
        )
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.hidden_size // config.num_attention_heads,
            base=config.rope_theta,
            device=weights.device,
        )
        with no_fp8(weights):
            self.layers.append(
                FlashLlamaLayer(
                    index=0,
                    prefix=f"{prefix}.layers.0",
                    config=config,
                    weights=weights,
                    rotary_emb=rotary_emb,
                )
            )

        # Skip first and last layers
        for layer_id in range(1, config.num_hidden_layers - 1):
            if layer_id in self.cross_attention_layers:
                from text_generation_server.models.custom_modeling.flash_mllama import (
                    FlashLlamaCrossLayer,
                )

                self.layers.append(
                    FlashLlamaCrossLayer(
                        index=layer_id,
                        prefix=(f"{prefix}.layers.{layer_id}"),
                        config=config,
                        weights=weights,
                    )
                )
            else:
                self.layers.append(
                    FlashLlamaLayer(
                        index=layer_id,
                        prefix=(f"{prefix}.layers.{layer_id}"),
                        config=config,
                        weights=weights,
                        rotary_emb=rotary_emb,
                    )
                )

        with no_fp8(weights):
            last_layer_id = config.num_hidden_layers - 1
            self.layers.append(
                FlashLlamaLayer(
                    index=last_layer_id,
                    prefix=(f"{prefix}.layers.{last_layer_id}"),
                    config=config,
                    weights=weights,
                    rotary_emb=rotary_emb,
                )
            )

        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        adapter_data,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        cross_attention_states=None,
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )

        hidden_states = inputs_embeds

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                adapter_data,
                cross_attention_states,
                hpu_attention_meta=hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashLlamaForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, name=None):
        if name is None:
            name = "model"
        super().__init__()
        with no_fp8(weights):
            self.embed_tokens = TensorParallelEmbedding(
                prefix=(
                    f"{name}.embed_tokens"
                    if not prefix
                    else f"{prefix}.{name}.embed_tokens"
                ),
                weights=weights,
            )
        self.model = FlashLlamaModel(
            prefix=name if not prefix else f"{prefix}.{name}",
            config=config,
            weights=weights,
        )
        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        # Used in Granite
        embedding_multiplier = getattr(config, "embedding_multiplier", None)
        if embedding_multiplier is not None:
            self.embed_tokens.weight.data *= embedding_multiplier
        prefix = suffix if not prefix or name != "model" else f"{prefix}.{suffix}"
        with no_fp8(weights):
            self.lm_head = SpeculativeHead.load(
                config,
                prefix,
                weights,
            )

        # Used in Granite
        self.logits_scaling = getattr(config, "logits_scaling", None)
        if self.logits_scaling is not None and self.lm_head.head is not None:
            try:
                # Scale the weights directly
                self.lm_head.head.linear.weight.data /= self.logits_scaling
                self.logits_scaled = True
            except Exception:
                self.logits_scaled = False

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
        cross_attention_states=None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data=adapter_data,
            cross_attention_states=cross_attention_states,
            hpu_attention_meta=hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)

        # Used in Granite
        if self.logits_scaling is not None and not self.logits_scaled:
            logits /= self.logits_scaling
            if speculative_logits is not None:
                speculative_logits /= self.logits_scaling

        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llava_next.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Llava-NeXT model."""

from typing import List, Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from transformers.image_processing_utils import select_best_resolution

from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata
from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
    load_vision_model,
)
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)


def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (`tuple`):
            The size of the input image in the format (height, width).
        grid_pinpoints (`List`):
            A list containing possible resolutions. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        patch_size (`int`):
            The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (height, width).
    """
    if not isinstance(grid_pinpoints, list):
        raise ValueError("grid_pinpoints should be a list of tuples or lists")

    height, width = select_best_resolution(image_size, grid_pinpoints)
    return height // patch_size, width // patch_size


def unpad_image(tensor, original_size):
    """
    Unpads a PyTorch tensor of a padded and resized image.

    Args:
        tensor (`torch.Tensor`):
            The image tensor, assumed to be of shape (num_channels, height, width).
        original_size (`tuple`):
            The original size of the image (height, width).

    Returns:
        `torch.Tensor`: The unpadded image tensor.
    """
    original_height, original_width = original_size
    current_height, current_width = tensor.shape[1:]

    original_aspect_ratio = original_width / original_height
    current_aspect_ratio = current_width / current_height

    if original_aspect_ratio > current_aspect_ratio:
        scale_factor = current_width / original_width
        new_height = int(original_height * scale_factor)
        padding = (current_height - new_height) // 2
        unpadded_tensor = tensor[:, padding : current_height - padding, :]
    else:
        scale_factor = current_height / original_height
        new_width = int(original_width * scale_factor)
        padding = (current_width - new_width) // 2
        unpadded_tensor = tensor[:, :, padding : current_width - padding]

    return unpadded_tensor


# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext
class LlavaNextMultiModalProjector(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.linear_1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.linear_1", config=config, weights=weights, bias=True
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.linear_2", config=config, weights=weights, bias=True
        )

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


class FlashLlavaNextForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = config.quantize
        vision_config = config.vision_config
        # Instead of selecting in hidden_states[-2].
        # Instead compute only the n -2 + 1 layers and don't pool
        if config.vision_feature_layer < 0:
            vision_config.num_hidden_layers += config.vision_feature_layer + 1
        else:
            vision_config.num_hidden_layers = config.vision_feature_layer + 1
        self.vision_tower = load_vision_model(
            prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
            config=config.vision_config,
            weights=weights,
        )

        self.multi_modal_projector = LlavaNextMultiModalProjector(
            prefix="multi_modal_projector", config=config, weights=weights
        )

        self.image_newline = weights.get_tensor("image_newline")

        self.vocab_size = config.text_config.vocab_size
        self.config = config
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator
        self.text_model = load_text_model(
            prefix="language_model" if not prefix else f"{prefix}.language_model",
            config=config.text_config,
            weights=weights,
        )
        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )

    def _merge_input_ids_with_image_features(
        self,
        input_ids: torch.Tensor,
        inputs_embeds: torch.Tensor,
        image_features: torch.Tensor,
    ):
        """In place merges in vision_embeddings with inputs_embeds."""
        mask = torch.where(input_ids == self.config.image_token_index)
        # Let's pray we have enabled enough slots !
        try:
            inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
        except Exception as e:
            raise RuntimeError(
                f"Cannot fill images right now. If error happens at warmup, make sure you have enough `--max-input-tokens`  to handle images. If error happens at regular runtime, please fill in an issue: {e}"
            )
        return inputs_embeds

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        # num_special_image_tokens = (input_ids == self.config.image_token_index).sum()
        # assert num_special_image_tokens == len(pixel_values), f"Received {num_special_image_tokens} for {len(pixel_values)} images, this is invalid"
        # 1. Extract the input embeddings

        # 2. Merge text and images
        num_images, num_patches, channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(
            num_images * num_patches, channels, height, width
        )
        image_features = self.vision_tower(pixel_values)

        # selected_image_feature = image_features.hidden_states[self.config.vision_feature_layer]
        # Already done within the clip model
        selected_image_feature = image_features.last_hidden_state

        if self.config.vision_feature_select_strategy == "default":
            selected_image_feature = selected_image_feature[:, 1:]
        elif self.config.vision_feature_select_strategy == "full":
            selected_image_feature = selected_image_feature
        else:
            raise RuntimeError(
                f"Strategy `{self.config.vision_feature_select_strategy}` is not supported/valid."
            )

        image_features = self.multi_modal_projector(selected_image_feature)

        # split up image_features for each of the individual images
        # hence we get a list of image_features, each of shape (5, num_patches, hidden_size)
        # if we assume each image has 5 image features (base image + 4 patches)
        split_sizes = [num_patches] * num_images
        image_features = torch.split(image_features, split_sizes, dim=0)

        # NOTE we only support multimodal_patch_merge_type == "spatial_unpad"
        height = width = (
            self.config.vision_config.image_size // self.config.vision_config.patch_size
        )

        new_image_features = []
        for image_idx, image_feature in enumerate(image_features):
            if image_feature.shape[0] > 1:
                base_image_feature = image_feature[0]
                image_feature = image_feature[1:]

                if height * width != base_image_feature.shape[0]:
                    raise ValueError(
                        "The number of patches is not consistent with the image size."
                    )

                # Dimensions are intentionally swapped to be bug-compatible with
                # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
                    image_sizes[image_idx],
                    self.config.image_grid_pinpoints,
                    self.config.vision_config.image_size,
                )
                image_feature = image_feature.view(
                    num_patch_height, num_patch_width, height, width, -1
                )
                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                image_feature = unpad_image(image_feature, image_sizes[image_idx])
                image_feature = torch.cat(
                    (
                        image_feature,
                        self.image_newline[:, None, None].expand(
                            *image_feature.shape[:-1], 1
                        ),
                    ),
                    dim=-1,
                )
                image_feature = image_feature.flatten(1, 2).transpose(0, 1)
                image_feature = torch.cat((base_image_feature, image_feature), dim=0)
            else:
                image_feature = image_feature[0]
                image_feature = torch.cat(
                    (image_feature, self.image_newline[None]), dim=0
                )
            new_image_features.append(image_feature)
        image_features = torch.stack(new_image_features, dim=0)
        return image_features.view(-1, image_features.shape[-1])

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
        pixel_values: torch.FloatTensor = None,
        image_sizes: Optional[torch.LongTensor] = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            inputs_embeds = self._merge_input_ids_with_image_features(
                input_ids, inputs_embeds, vision_embeds
            )
        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ):

        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            slots=slots,
            seqlen=seqlen,
            hpu_attention_meta=hpu_attention_meta,
            adapter_data=adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple

from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
import habana_frameworks.torch as htorch


class MistralConfig(PretrainedConfig):
    model_type = "mistral"

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        sliding_window=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class MistralAttention(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, layer_id, rotary_emb):
        super().__init__()
        self.max_past = (
            config.sliding_window if config.sliding_window is not None else -1
        )
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size

        if getattr(config, "head_dim", None) is not None:
            self.head_size = config.head_dim
        else:
            self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = rotary_emb

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        query_key_value = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )

        self.query_key_value = TensorParallelMultiAdapterLinear.load(
            query_key_value,
            layer_id,
            ["q_proj", "k_proj", "v_proj"],
            sizes=[
                self.head_size * config.num_attention_heads,
                self.head_size * config.num_key_value_heads,
                self.head_size * config.num_key_value_heads,
            ],
            process_group=weights.process_group,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            layer_id,
            "o_proj",
            process_group=weights.process_group,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
                window_size_left=self.max_past,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
                window_size_left=self.max_past,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class MistralMLP(nn.Module):
    def __init__(self, prefix: str, config, weights, layer_id):
        super().__init__()
        self.hidden_act = config.hidden_act
        self.act = (
            ACT2FN[self.hidden_act]
            if "gelu" not in self.hidden_act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh"
                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
                    else "none"
                ),
            )
        )
        # Fuse gate and up proj
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            layer_id,
            ["gate_proj", "up_proj"],
            sizes=[
                config.intermediate_size,
                config.intermediate_size,
            ],
            process_group=weights.process_group,
        )

        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )

        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            layer_id,
            "down_proj",
            process_group=weights.process_group,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


class MistralLayer(nn.Module):
    def __init__(self, prefix: str, config, weights, layer_id, rotary_emb):
        super().__init__()
        self.self_attn = MistralAttention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            layer_id=layer_id,
            rotary_emb=rotary_emb,
        )
        self.mlp = MistralMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id
        )

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        hpu_attention_meta,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            hpu_attention_meta,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        mlp_output = self.mlp(normed_attn_res_output, adapter_data)

        return mlp_output, attn_res


class MistralModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()

        if getattr(config, "head_dim", None) is not None:
            head_dim = config.head_dim
        else:
            head_dim = config.hidden_size // config.num_attention_heads

        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=head_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                MistralLayer(
                    prefix=f"{prefix}.layers.{layer_id}",
                    config=config,
                    weights=weights,
                    layer_id=layer_id,
                    rotary_emb=rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        adapter_data: Optional[torch.Tensor] = None,
    ):
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )
        hidden_states = inputs_embeds
        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                adapter_data,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states


class FlashMistralForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, name=None):
        if name is None:
            name = "model"
        super().__init__()
        self.embed_tokens = TensorParallelEmbedding(
            prefix=(
                f"{name}.embed_tokens"
                if not prefix
                else f"{prefix}.{name}.embed_tokens"
            ),
            weights=weights,
        )
        self.model = MistralModel(
            prefix=name if not prefix else f"{prefix}.{name}",
            config=config,
            weights=weights,
        )
        self.lm_head = SpeculativeHead.load(
            config,
            # TODO dirty hack for idefics2.
            prefix=(
                "lm_head" if not prefix or name != "model" else f"{prefix}.lm_head"
            ),
            weights=weights,
        )
        self.max_past = config.sliding_window
        self.max_past_tensor = (
            torch.tensor(config.sliding_window, device=weights.device)
            if self.max_past is not None
            else None
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
            adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Type

import torch
import torch.distributed
from torch import nn
from transformers.configuration_utils import PretrainedConfig

from text_generation_server.layers import (
    FastLinear,
    SpeculativeHead,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
)
from text_generation_server.layers.attention import (
    Seqlen,
    attention,
    paged_attention,
    set_block_mapping,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import FastRMSNorm
from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.utils.weights import UnquantizedWeight
import habana_frameworks.torch as htorch


class MixtralConfig(PretrainedConfig):
    model_type = "mixtral"

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        sliding_window=None,
        num_experts_per_tok=2,
        num_local_experts=8,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


def promote_scalar(x: torch.Tensor) -> torch.Tensor:
    return x.view(1) if len(x.size()) == 0 else x


def load_attention(config, prefix: str, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias=None))


def _load_experts(config, prefix: str, mat, weights):
    if config.quantize is not None:
        raise NotImplementedError("Mixtral does not support weight quantization yet.")

    assert mat in ["w1", "w2", "w3"]

    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    assert (
        config.intermediate_size % world_size == 0
    ), f"The chosen size {config.intermediate_size} is not compatible with sharding on {world_size} shards"

    block_size = config.intermediate_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size

    tensor = torch.empty(
        (config.num_local_experts * block_size, config.hidden_size),
        dtype=weights.dtype,
        device=weights.device,
    )

    for i in range(config.num_local_experts):
        slice_ = weights._get_slice(f"{prefix}.{i}.{mat}.weight")

        if mat == "w2":
            expert_slice = slice_[:, start:stop].t().contiguous()
        else:
            expert_slice = slice_[start:stop]
        tensor[i * block_size : (i + 1) * block_size] = expert_slice.to(
            dtype=weights.dtype
        ).to(device=weights.device)
    return tensor


class MixtralAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.max_past = (
            config.sliding_window if config.sliding_window is not None else -1
        )
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        self.rotary_emb = rotary_emb

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
                window_size_left=self.max_past,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


@torch.jit.script
def select_experts(gate_logits: torch.Tensor, top_k: int):
    # all_probs: (sequence_length, n_experts) and upcast for softmax
    all_probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)
    # weights, selected_experts: (sequence_length, top-k)
    weights, selected_experts = torch.topk(all_probs, top_k, dim=-1)
    weights /= weights.sum(dim=-1, keepdim=True)
    weights = weights.view(-1)
    selected_experts = selected_experts.view(-1)

    return selected_experts, weights


@torch.jit.script
def round_up(x: torch.Tensor, value: int):
    return torch.div(x + (value - 1), value, rounding_mode="trunc") * value


class MixtralMoE(nn.Module):
    def __init__(
        self, prefix, config: MixtralConfig, moe_layer_cls: Type[MoELayer], weights
    ):
        super().__init__()

        # gating
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)

        self.moe = moe_layer_cls(
            n_expert_group=None,
            n_experts=config.num_local_experts,
            prefix=f"{prefix}.experts",
            renormalize=True,
            topk=config.num_experts_per_tok,
            topk_group=None,
            weights=weights,
            gate_proj_name="w1",
            up_proj_name="w3",
            down_proj_name="w2",
        )
        assert isinstance(self.moe, MoELayer)

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # router_logits: (num_tokens, n_experts)
        router_logits = self.gate(x)
        out = self.moe(x, gating_output=router_logits)

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class MixtralLayer(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"

        self.self_attn = MixtralAttention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            rotary_emb=rotary_emb,
        )

        moe_layer_cls = (
            SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
        )
        self.moe = MixtralMoE(
            f"{prefix}.block_sparse_moe", config, moe_layer_cls, weights
        )

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        moe_output = self.moe(normed_attn_res_output)

        return moe_output, attn_res


class MixtralModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=(
                "model.embed_tokens" if not prefix else f"{prefix}.model.embed_tokens"
            ),
            weights=weights,
        )

        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.hidden_size // config.num_attention_heads,
            base=config.rope_theta,
            device=weights.device,
        )
        self.layers = nn.ModuleList(
            [
                MixtralLayer(
                    "model" if not prefix else f"{prefix}.model",
                    layer_id,
                    config,
                    weights,
                    rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix="model.norm" if not prefix else f"{prefix}.model.norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashMixtralForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.model = MixtralModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head" if not prefix else f"{prefix}.lm_head",
            weights=weights,
        )
        self.max_past = config.sliding_window
        self.max_past_tensor = (
            torch.tensor(config.sliding_window, device=weights.device)
            if self.max_past is not None
            else None
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mllama.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Mllama model."""

from typing import Optional, Tuple, List

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
import torch.nn.functional as F

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    FastLinear,
)
from text_generation_server.layers.attention import (
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.models.custom_modeling.flash_llama_modeling import (
    FlashLlamaForCausalLM,
)
from habana_frameworks.torch.hpex.kernels import FusedSDPA
from vllm_hpu_extension.utils import ModuleFusedSDPA
import habana_frameworks.torch as htorch


def _prepare_aspect_ratio_attention_mask(
    aspect_ratio_mask: torch.Tensor,
    num_patches: int,
    target_length: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    # Expand aspect ratio mask to target_length
    batch_size, max_num_tiles = aspect_ratio_mask.shape
    attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype)
    attention_mask = attention_mask.repeat(1, 1, target_length, 1)

    # Mask padding patches
    pad_patches = target_length - num_patches
    attention_mask[:, :, -pad_patches:] = 0

    # Invert the mask (0 -> 1, 1 -> 0)
    attention_mask = 1 - attention_mask

    # Reshape to 2D and create 4D attention mask
    # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length)
    attention_mask = attention_mask.reshape(
        batch_size, max_num_tiles * target_length, 1
    )
    attention_mask = (
        attention_mask @ attention_mask.transpose(-1, -2) * torch.finfo(dtype).min
    )
    attention_mask = attention_mask.unsqueeze(1)

    return attention_mask


# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
def _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask: torch.Tensor,
    sequence_length: int,
    target_length: int,
    dtype: torch.dtype,
    device: torch.device,
    min_dtype: float,
    cache_position: torch.Tensor,
    batch_size: int,
):
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

    Args:
        attention_mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
        sequence_length (`int`):
            The sequence length being processed.
        target_length (`int`):
            The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
        dtype (`torch.dtype`):
            The dtype to use for the 4D attention mask.
        device (`torch.device`):
            The device to plcae the 4D attention mask on.
        min_dtype (`float`):
            The minimum value representable with the dtype `dtype`.
        cache_position (`torch.Tensor`):
            Indices depicting the position of the input sequence tokens in the sequence.
        batch_size (`torch.Tensor`):
            Batch size.
    """
    if attention_mask is not None and attention_mask.dim() == 4:
        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
        causal_mask = attention_mask
    else:
        causal_mask = torch.full(
            (sequence_length, target_length),
            fill_value=min_dtype,
            dtype=dtype,
            device=device,
        )
        if sequence_length != 1:
            causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(
            target_length, device=device
        ) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
        if attention_mask is not None:
            causal_mask = (
                causal_mask.clone()
            )  # copy to contiguous memory for in-place edit
            mask_length = attention_mask.shape[-1]
            padding_mask = (
                causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
            )
            padding_mask = padding_mask == 0
            causal_mask[:, :, :, :mask_length] = causal_mask[
                :, :, :, :mask_length
            ].masked_fill(padding_mask, min_dtype)

    return causal_mask


def _prepare_cross_attention_mask(
    cross_attention_mask: torch.Tensor,
    num_vision_tokens: int,
    dtype: str,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # reshape so it can be used by attn module
    batch_size, text_total_length, *_ = cross_attention_mask.shape
    cross_attention_mask = cross_attention_mask.repeat_interleave(
        num_vision_tokens, dim=3
    )
    cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1)
    cross_attention_mask = cross_attention_mask.unsqueeze(1)

    # invert the mask
    inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype)
    cross_attention_mask = inverted_cross_attn_mask.masked_fill(
        inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min
    )

    # apply full-row bias, which return 4D tensor of shape [B, H, S1, 1] where value is 0 if the a full row in cross attn mask's
    # last dimension contains negative infinity values, otherwise it's 1
    negative_inf_value = torch.finfo(dtype).min
    full_text_row_masked_out_mask = (
        (cross_attention_mask != negative_inf_value)
        .any(dim=-1)
        .type_as(cross_attention_mask)[..., None]
    )
    cross_attention_mask *= full_text_row_masked_out_mask

    return cross_attention_mask, full_text_row_masked_out_mask


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->MllamaVision
class MllamaVisionMLP(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MllamaVisionSdpaAttention(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()

        self.embed_dim = config.hidden_size
        self.head_dim = config.hidden_size // config.attention_heads
        self.num_heads = config.attention_heads // weights.process_group.size()

        self.qkv_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )

    def forward(
        self,
        hidden_state: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        qkv = self.qkv_proj(hidden_state)
        query, key, value = qkv.split(
            [
                self.head_dim * self.num_heads,
                self.head_dim * self.num_heads,
                self.head_dim * self.num_heads,
            ],
            dim=2,
        )

        batch_size, q_seq_len, _ = query.shape
        _, kv_seq_len, _ = key.shape

        query = query.view(batch_size, q_seq_len, self.num_heads, self.head_dim)
        key = key.view(batch_size, kv_seq_len, self.num_heads, self.head_dim)
        value = value.view(batch_size, kv_seq_len, self.num_heads, self.head_dim)

        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        fsdpa_op = ModuleFusedSDPA(FusedSDPA)
        attn_output = fsdpa_op(
            query,
            key,
            value,
            attn_mask=attention_mask,
            dropout_p=0.0,
            is_causal=False,
            scale=None,
            softmax_mode="None",
            recompute_mode=None,
            valid_sequence_lengths=None,
        )
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_seq_len, -1)

        output = self.o_proj(attn_output)
        return output


class MllamaVisionEncoderLayer(nn.Module):
    def __init__(self, *, prefix, config, weights, is_gated: bool):
        super().__init__()

        self.hidden_size = config.hidden_size
        self.num_attention_heads = config.attention_heads
        self.is_gated = is_gated
        self.intermediate_size = config.intermediate_size

        self.self_attn = MllamaVisionSdpaAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.mlp = MllamaVisionMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights
        )

        self.input_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=1e-05
        )
        self.post_attention_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=1e-05
        )

        # there used to be an if else here, no code path
        if is_gated:
            self.gate_attn = nn.Parameter(
                weights.get_tensor(f"{prefix}.gate_attn"), requires_grad=False
            )
            self.gate_ffn = nn.Parameter(
                weights.get_tensor(f"{prefix}.gate_ffn"), requires_grad=False
            )

    def forward(
        self,
        hidden_state: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        # Self Attention
        residual = hidden_state
        hidden_state = self.input_layernorm(hidden_state)
        hidden_state = self.self_attn(hidden_state, attention_mask=attention_mask)
        gate_attn = 1 if not self.is_gated else self.gate_attn.tanh()
        hidden_state = residual + gate_attn * hidden_state

        # Feed forward
        residual = hidden_state
        hidden_state = self.post_attention_layernorm(hidden_state)
        hidden_state = self.mlp(hidden_state)
        gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh()
        hidden_state = residual + gate_ffn * hidden_state
        return hidden_state


class MllamaVisionEncoder(nn.Module):
    def __init__(self, *, prefix, config, weights, is_gated: bool, num_layers: int):
        super().__init__()
        self.config = config
        self.layers = [
            MllamaVisionEncoderLayer(
                prefix=f"{prefix}.layers.{i}",
                config=config,
                weights=weights,
                is_gated=is_gated,
            )
            for i in range(num_layers)
        ]

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        encoder_states = [hidden_states]
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for encoder_layer in self.layers:
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
            )

            hidden_states = layer_outputs
            encoder_states.append(hidden_states)
            if lazy_mode:
                htorch.core.mark_step()

        return hidden_states, encoder_states


class MllamaPrecomputedAspectRatioEmbedding(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.max_num_tiles = config.max_num_tiles
        self.hidden_size = config.hidden_size
        self.max_aspect_ratio_id = config.max_aspect_ratio_id

        self.embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.embedding", weights=weights
        )
        self.gate = nn.Parameter(
            weights.get_tensor(f"{prefix}.gate"), requires_grad=False
        )

    def forward(
        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
    ) -> torch.Tensor:
        embeddings = self.embedding(aspect_ratio_ids)
        embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size)

        # Always gated.
        embeddings = embeddings * self.gate.tanh()

        hidden_state = hidden_state + embeddings
        return hidden_state


class MllamaPrecomputedPositionEmbedding(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.max_num_tiles = config.max_num_tiles
        self.max_aspect_ratio_id = config.max_aspect_ratio_id
        self.num_patches = (config.image_size // config.patch_size) ** 2 + 1
        self.hidden_size = config.hidden_size
        self.scale = config.hidden_size**-0.5

        self.gate = nn.Parameter(
            weights.get_tensor(f"{prefix}.gate"), requires_grad=False
        )

        # position embedding
        embedding = nn.Parameter(
            weights.get_tensor(f"{prefix}.embedding"), requires_grad=False
        )
        self.gated_position_embedding = (1 - self.gate.tanh()) * embedding
        self.tile_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.tile_embedding", weights=weights
        )

    def forward(
        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
    ) -> torch.Tensor:
        # position embeddings
        hidden_state = hidden_state + self.gated_position_embedding.view(
            1, 1, self.num_patches, self.hidden_size
        )

        # precomputed tile position embeddings
        tile_position_embedding = self.tile_embedding(aspect_ratio_ids)
        batch_size = hidden_state.shape[0]
        tile_position_embedding = tile_position_embedding.reshape(
            batch_size, self.max_num_tiles, self.num_patches, self.hidden_size
        )
        gated_tile_position_embedding = self.gate.tanh() * tile_position_embedding
        hidden_state = hidden_state + gated_tile_position_embedding

        return hidden_state


class MllamaVisionModel(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.max_num_tiles = config.max_num_tiles
        self.hidden_size = config.hidden_size
        self.num_channels = config.num_channels
        self.intermediate_layers_indices = config.intermediate_layers_indices

        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
        self.scale = config.hidden_size**-0.5
        self.dtype = weights.dtype

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.hidden_size,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
            bias=False,
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )

        self.class_embedding = nn.Parameter(
            weights.get_tensor(f"{prefix}.class_embedding"), requires_grad=False
        )

        self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding(
            prefix=f"{prefix}.gated_positional_embedding",
            config=config,
            weights=weights,
        )

        self.pre_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding(
            prefix=f"{prefix}.pre_tile_positional_embedding",
            config=config,
            weights=weights,
        )
        self.post_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding(
            prefix=f"{prefix}.post_tile_positional_embedding",
            config=config,
            weights=weights,
        )

        ## layer norms
        self.layernorm_pre = nn.LayerNorm.load(
            prefix=f"{prefix}.layernorm_pre",
            weights=weights,
            # torch default
            eps=1e-05,
        )
        self.layernorm_post = nn.LayerNorm.load(
            prefix=f"{prefix}.layernorm_post",
            weights=weights,
            # torch default
            eps=1e-05,
        )

        ## encoders
        self.transformer = MllamaVisionEncoder(
            prefix=f"{prefix}.transformer",
            config=config,
            weights=weights,
            is_gated=False,
            num_layers=config.num_hidden_layers,
        )
        self.global_transformer = MllamaVisionEncoder(
            prefix=f"{prefix}.global_transformer",
            config=config,
            weights=weights,
            is_gated=True,
            num_layers=config.num_global_layers,
        )

    def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor:
        batch_size, _, hidden_size = hidden_state.shape
        class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size)
        hidden_state = torch.cat([class_embedding, hidden_state], dim=1)
        return hidden_state

    def forward(
        self,
        pixel_values: torch.Tensor,
        aspect_ratio_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        (
            batch_size,
            num_concurrent_media,
            num_tiles,
            num_channels,
            height,
            width,
        ) = pixel_values.shape

        pixel_values = pixel_values.reshape(
            batch_size * num_concurrent_media * num_tiles, num_channels, height, width
        )
        aspect_ratio_ids = aspect_ratio_ids.reshape(
            batch_size * num_concurrent_media, -1
        )

        # patch embedding
        patch_embeds = self.patch_embedding(pixel_values)
        hidden_state = patch_embeds.flatten(2).transpose(1, 2)

        # tile embeddings
        _, num_patches, dim = hidden_state.shape
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media, num_tiles, -1, dim
        )
        hidden_state = self.pre_tile_positional_embedding(
            hidden_state, aspect_ratio_ids
        )

        # apply cls token
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media * num_tiles, num_patches, dim
        )
        hidden_state = self.apply_class_embedding(hidden_state)
        num_patches += 1

        # apply position embeddings
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media, num_tiles, num_patches, dim
        )
        hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids)

        # apply encoder
        hidden_state = self.layernorm_pre(hidden_state)

        # Compute the number of tokens to pad
        num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8
        # Compute padding tuple for pad function
        padding = (
            0,
            0,
            0,
            num_padding_patches,
        )  # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2)
        # Pad the tensor
        hidden_state = F.pad(hidden_state, padding, mode="constant", value=0)
        slice_index = -num_padding_patches if num_padding_patches > 0 else None

        if attention_mask is not None:
            attention_mask = attention_mask.reshape(
                batch_size * num_concurrent_media, -1
            )
            attention_mask = _prepare_aspect_ratio_attention_mask(
                aspect_ratio_mask=attention_mask,
                num_patches=self.num_patches,
                target_length=hidden_state.shape[2],
                dtype=self.dtype,
            )

        hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim)
        hidden_state, all_intermediate_hidden_states = self.transformer(
            hidden_state,
            attention_mask=attention_mask,
        )
        intermediate_hidden_states = [
            hidden_state
            for idx, hidden_state in enumerate(all_intermediate_hidden_states)
            if idx in self.intermediate_layers_indices
        ]
        intermediate_hidden_states = torch.stack(intermediate_hidden_states, dim=-1)

        # apply global encoder
        hidden_state = self.layernorm_post(hidden_state)
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media,
            num_tiles,
            num_patches + num_padding_patches,
            dim,
        )
        hidden_state = self.post_tile_positional_embedding(
            hidden_state, aspect_ratio_ids
        )
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media,
            num_tiles * (num_patches + num_padding_patches),
            dim,
        )
        hidden_state, _ = self.global_transformer(
            hidden_state, attention_mask=attention_mask
        )
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media,
            num_tiles,
            num_patches + num_padding_patches,
            dim,
        )
        hidden_state = hidden_state[:, :, :slice_index]

        # adding intermediate layer outputs
        hidden_state = hidden_state.reshape(
            batch_size, num_concurrent_media, num_tiles, num_patches, dim
        )
        intermediate_hidden_states = intermediate_hidden_states.reshape(
            batch_size * num_concurrent_media,
            num_tiles,
            num_patches + num_padding_patches,
            -1,
        )
        intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index]
        intermediate_hidden_states = intermediate_hidden_states.reshape(
            batch_size, num_concurrent_media, num_tiles, num_patches, -1
        )
        hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1)
        return hidden_state


class MllamaTextCrossAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, *, prefix, config, weights, layer_idx):
        super().__init__()
        self.config = config
        self.num_heads = self.config.num_attention_heads
        self.num_key_value_heads = self.config.num_key_value_heads
        self.dropout = config.dropout
        self.hidden_size = config.hidden_size
        self.head_size = config.hidden_size // self.num_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.layer_idx = layer_idx

        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            self.num_key_value_heads // weights.process_group.size()
        )

        self.q_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.q_proj",
            weights=weights,
            bias=False,
        )
        self.k_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.k_proj",
            weights=weights,
            bias=False,
        )
        self.v_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.v_proj",
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )

        self.q_norm = MllamaTextRMSNorm.load(
            prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps
        )
        self.k_norm = MllamaTextRMSNorm.load(
            prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps
        )
        self.softmax_scale = self.head_size**-0.5

    def forward(
        self,
        hidden_states: torch.Tensor,
        cross_attention_states: Optional[torch.Tensor] = None,
        # past_key_value=None,
        # attention_mask: Optional[torch.Tensor] = None,
        # cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        # hidden_states = hidden_states.unsqueeze(0)
        # bsz, q_len, _ = hidden_states.size()
        (
            cross_attention_states,
            cross_attention_len,
            indices,
        ) = cross_attention_states
        bs = cross_attention_len.size(0)
        query_states = self.q_proj(hidden_states)
        query_states = query_states.view(bs, -1, self.num_heads, self.head_size)
        query_states = self.q_norm(query_states)

        key_states = self.k_proj(cross_attention_states)
        value_states = self.v_proj(cross_attention_states)
        key_states = key_states.view(bs, -1, self.num_key_value_heads, self.head_size)
        value_states = value_states.view(
            bs, -1, self.num_key_value_heads, self.head_size
        )
        key_states = self.k_norm(key_states)

        # key_states = key_states.repeat(1, self.num_key_value_groups, 1)
        # value_states = value_states.repeat(1, self.num_key_value_groups, 1)
        # logger.info(
        #     f"Q: {query_states.shape} -K {key_states.shape} - V{value_states.shape}"
        # )
        # execute sdpa
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)
        fsdpa_op = ModuleFusedSDPA(FusedSDPA)
        attn_output = fsdpa_op(
            query_states,
            key_states,
            value_states,
            attn_mask=None,
            dropout_p=0.0,
            is_causal=False,
            scale=None,
            softmax_mode="None",
            recompute_mode=None,
            valid_sequence_lengths=None,
        )
        attn_output = attn_output.transpose(1, 2).squeeze(0).contiguous()
        attn_output = self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))

        return attn_output


# Copied from transformers.models.gemma2.modeling_gemma2.Gemma2MLP with Gemma2->MllamaText
class MllamaTextMLP(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        shape = x.shape
        gate_up_states = self.gate_up_proj(x)
        gate_up_states = gate_up_states.view(*shape[:-1], 2, self.intermediate_size)
        result = self.down_proj(
            self.act_fn(gate_up_states[:, 0]) * gate_up_states[:, 1]
        )
        return result


class FlashLlamaCrossLayer(torch.nn.Module):
    """Cross-attention transformer block with tanh-gated attention and feedforward."""

    def __init__(self, *, prefix, config, weights, index) -> None:
        layer_idx = index
        super().__init__()
        self.cross_attn = MllamaTextCrossAttention(
            prefix=f"{prefix}.cross_attn",
            config=config,
            weights=weights,
            layer_idx=layer_idx,
        )

        self.input_layernorm = MllamaTextRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.cross_attn_attn_gate = torch.nn.Parameter(
            weights.get_tensor(f"{prefix}.cross_attn_attn_gate"), requires_grad=False
        )

        self.mlp = MllamaTextMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.post_attention_layernorm = MllamaTextRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.cross_attn_mlp_gate = torch.nn.Parameter(
            weights.get_tensor(f"{prefix}.cross_attn_mlp_gate"), requires_grad=False
        )
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        cross_attention_states,  # [ IB, ...]
        hpu_attention_meta,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if cross_attention_states is None:
            return hidden_states, residual
        if residual is not None:
            hidden_states += residual

        indices = cross_attention_states[-1]
        out_hidden_states = hidden_states[:]
        hidden_states = hidden_states[indices]
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        hidden_states = self.cross_attn(
            hidden_states=hidden_states,
            # attention_mask=cross_attention_mask,
            cross_attention_states=cross_attention_states,
        )
        hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states

        out_hidden_states[indices] = hidden_states
        hidden_states = out_hidden_states

        return hidden_states, None


# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->MllamaText
class MllamaTextRMSNorm(nn.Module):
    def __init__(self, weight, eps):
        super().__init__()
        self.weight = weight
        self.variance_epsilon = eps

    @classmethod
    def load(cls, *, prefix, weights, eps):
        weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.weight"), requires_grad=False
        )
        return cls(weight=weight, eps=eps)

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class FlashMllamaForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator
        config.text_config._attn_implementation = "sdpa"
        self.hidden_size = config.text_config.hidden_size
        self.vision_model = MllamaVisionModel(
            prefix="vision_model", config=config.vision_config, weights=weights
        )
        self.multi_modal_projector = FastLinear.load(
            prefix="multi_modal_projector", config=config, weights=weights, bias=True
        )
        self.text_model = FlashLlamaForCausalLM(
            prefix="language_model", config=config.text_config, weights=weights
        )
        self.config = config
        self.dtype = weights.dtype
        self.device = weights.device

    def vision_forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask):
        if aspect_ratio_ids is None:
            raise ValueError(
                "`aspect_ratio_ids` must be provided if `pixel_values` is provided"
            )
        # logger.info(f"PIxel values {pixel_values.shape}")
        batch_size = pixel_values.shape[0]
        vision_states = self.vision_model(
            pixel_values, aspect_ratio_ids, aspect_ratio_mask
        )
        cross_attention_states = self.multi_modal_projector(vision_states).reshape(
            -1, vision_states.shape[-2], self.hidden_size
        )
        _, _, h = cross_attention_states.shape
        cross_attention_states = cross_attention_states.view(batch_size, -1, h)
        # logger.info(f"cross {cross_attention_states.shape}")
        return cross_attention_states

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor],
        adapter_data: Optional[torch.Tensor] = None,
        cross_attention_states: Optional[torch.Tensor] = None,
        indices=None,
        cross_attention_len: Optional[torch.Tensor] = None,
    ):
        if cross_attention_states is not None:
            cross_attention_states = (
                cross_attention_states,
                cross_attention_len,
                indices,
            )

        outputs = self.text_model(
            input_ids=input_ids,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            slots=slots,
            seqlen=seqlen,
            hpu_attention_meta=hpu_attention_meta,
            lm_head_indices=lm_head_indices,
            adapter_data=adapter_data,
            cross_attention_states=cross_attention_states,
        )

        return outputs


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.models.gpt_neox import GPTNeoXConfig as TransformersGPTNeoXConfig
from typing import Optional, List, Tuple
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.utils.weights import UnquantizedWeight
import habana_frameworks.torch as htorch


class GPTNeoXConfig(TransformersGPTNeoXConfig):
    attribute_map = {
        "num_key_value_heads": "num_attention_heads",
    }


def load_row(config, prefix: str, weights, bias: bool):
    weight = weights.get_weights_row(prefix)

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None

    linear = get_linear(weight, bias)
    if config.use_parallel_residual:
        return linear
    else:
        return TensorParallelRowLinear(linear, process_group=weights.process_group)


def load_qkv(config, prefix: str, weights, num_heads, head_size, hidden_size):
    weight = weights.get_multi_weights_col([prefix], dim=0)
    if isinstance(weight, UnquantizedWeight):
        # Only on non quantized versions
        weight.weight = (
            weight.weight.view(
                num_heads,
                3,
                head_size,
                hidden_size,
            )
            .permute(1, 0, 2, 3)
            .reshape(-1, hidden_size)
        )

    bias = weights.get_sharded(f"{prefix}.bias", dim=0)
    bias = bias.view(num_heads, 3, head_size).permute(1, 0, 2).reshape(-1)

    linear = get_linear(weight, bias)
    if config.use_parallel_residual:
        return linear
    else:
        return TensorParallelColumnLinear(linear)


class FlashNeoxAttention(torch.nn.Module):
    def __init__(self, config, prefix, weights, rotary_emb):
        super().__init__()
        num_heads = config.num_attention_heads
        hidden_size = config.hidden_size

        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.head_size = hidden_size // num_heads

        self.rotary_dim = int(config.rotary_pct * self.head_size)

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.rotary_emb = rotary_emb
        self.softmax_scale = self.head_size ** (-0.5)

        self.query_key_value = load_qkv(
            config,
            prefix=f"{prefix}.query_key_value",
            weights=weights,
            num_heads=self.num_heads,
            head_size=self.head_size,
            hidden_size=self.hidden_size,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")
        self.dense = load_row(
            config, prefix=f"{prefix}.dense", weights=weights, bias=True
        )
        self.kv_head_mapping = torch.arange(
            0, self.num_heads, dtype=torch.int32, device=weights.device
        )

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states)
        qkv = qkv.view(-1, 3, self.num_heads, self.head_size)

        # Compute rotary embeddings on rotary_ndims
        query_rot = qkv[:, 0][..., : self.rotary_dim]
        query_pass = qkv[:, 0][..., self.rotary_dim :]
        key_rot = qkv[:, 1][..., : self.rotary_dim]
        key_pass = qkv[:, 1][..., self.rotary_dim :]

        # Inplace rotary
        self.rotary_emb(query_rot, key_rot, cos, sin)
        qkv[:, 0] = torch.cat((query_rot, query_pass), dim=-1)
        qkv[:, 1] = torch.cat((key_rot, key_pass), dim=-1)

        kv_cache.store(
            key=qkv[:, 1],
            value=qkv[:, 2],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=qkv[:, 0],
                key=qkv[:, 1],
                value=qkv[:, 2],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                qkv[:, 0],
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.dense(attn_output.view(-1, self.num_heads * self.head_size))


class FlashMLP(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        self.dense_h_to_4h = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=True
        )
        self.dense_4h_to_h = load_row(
            config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=True
        )

    def forward(self, hidden_states):
        hidden_states = self.dense_h_to_4h(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dense_4h_to_h(hidden_states)
        return hidden_states


class FlashNeoXLayer(nn.Module):
    def __init__(self, layer_id, config, weights, rotary_emb):
        super().__init__()

        layer_norm_eps = config.layer_norm_eps

        prefix = f"gpt_neox.layers.{layer_id}"

        self.use_parallel_residual = config.use_parallel_residual
        self.input_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=layer_norm_eps
        )
        self.post_attention_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=layer_norm_eps,
        )
        self.attention = FlashNeoxAttention(
            config,
            prefix=f"{prefix}.attention",
            weights=weights,
            rotary_emb=rotary_emb,
        )

        self.mlp = FlashMLP(config, prefix=f"{prefix}.mlp", weights=weights)
        self.process_group = weights.process_group

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        if self.use_parallel_residual:
            ln1_hidden_states, _ = self.input_layernorm(hidden_states)

            attn_output = self.attention(
                ln1_hidden_states,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache,
                slots,
                seqlen,
                hpu_attention_meta,
            )

            ln2_hidden_states, _ = self.post_attention_layernorm(hidden_states)

            mlp_output = self.mlp(ln2_hidden_states)
            intermediate = mlp_output + attn_output

            if self.process_group.size() > 1:
                torch.distributed.all_reduce(intermediate, group=self.process_group)

            return intermediate + hidden_states, None
        else:
            hidden_states, residual = self.input_layernorm(hidden_states, residual)

            hidden_states = self.attention(
                hidden_states,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache,
                slots,
                seqlen,
                hpu_attention_meta,
            )

            hidden_states, residual = self.post_attention_layernorm(
                hidden_states, residual
            )

            mlp_output = self.mlp(hidden_states)

            return mlp_output, residual


class FlashGPTNeoXPreTrainedModel(PreTrainedModel):
    config_class = GPTNeoXConfig
    base_model_prefix = "gpt_neox"
    supports_gradient_checkpointing = False
    _no_split_modules = None


class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)
        self.config = config

        self.embed_in = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_in", weights=weights
        )

        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=int(
                config.rotary_pct * (config.hidden_size // config.num_attention_heads)
            ),
            base=config.rotary_emb_base,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                FlashNeoXLayer(layer_id, config, weights, rotary_emb)
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.final_layer_norm = FastLayerNorm.load(
            prefix=f"{prefix}.final_layer_norm",
            weights=weights,
            eps=config.layer_norm_eps,
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].attention.head_size
        self.num_heads = self.layers[0].attention.num_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.embed_in(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].attention.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.final_layer_norm(hidden_states, residual)

        return hidden_states


class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel):
    def __init__(self, prefix, config, weights):
        super().__init__(config)

        if not prefix:
            prefix = "gpt_neox"
        else:
            prefix = f"{prefix}.gpt_neox"

        self.gpt_neox = FlashGPTNeoXModel(prefix, config, weights)

        self.embed_out = SpeculativeHead.load(
            config, prefix="embed_out", weights=weights
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.gpt_neox(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.embed_out(hidden_states)
        return logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
================================================
# coding=utf-8
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed
from torch import nn
from typing import Optional, List, Tuple

from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear
from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata
from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
    load_vision_model,
)


class PaliGemmaForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = config.quantize
        self.vision_tower = load_vision_model(
            prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
            config=config.vision_config,
            weights=weights,
        )
        self.post_vision_tower_layernorm = nn.LayerNorm.load(
            prefix="vision_tower.vision_model.post_layernorm",
            weights=weights,
            eps=config.vision_config.layer_norm_eps,
        )

        self.multi_modal_projector = TensorParallelColumnLinear.load(
            config,
            prefix="multi_modal_projector.linear",
            weights=weights,
            bias=True,
        )

        self.vocab_size = config.text_config.vocab_size
        self.config = config

        text_config = config.text_config
        text_config.speculator = config.speculator
        text_config.quantize = config.quantize
        self.text_model = load_text_model(
            prefix="language_model" if not prefix else f"{prefix}.language_model",
            config=config.text_config,
            weights=weights,
        )
        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )
        self.dtype = weights.dtype

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        pixel_values = pixel_values.to(dtype=self.dtype)
        image_outputs = self.vision_tower(pixel_values)
        last_hidden_state = self.post_vision_tower_layernorm(
            image_outputs.last_hidden_state
        )
        image_features = self.multi_modal_projector(last_hidden_state)
        image_features = image_features.view(-1, image_features.shape[-1])
        return image_features

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            mask = input_ids == self.config.image_token_index
            inputs_embeds[mask] = vision_embeds

        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # TODO This is odd but apparently pali gemma position ids start at 1.
        if cu_seqlen_prefill is not None:
            position_ids += 1

        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            slots=slots,
            seqlen=seqlen,
            hpu_attention_meta=hpu_attention_meta,
            adapter_data=adapter_data,
        )

        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)

        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
================================================
import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
import habana_frameworks.torch as htorch


class PhiConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=51200,
        hidden_size=2560,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="gelu_fast",  # llama uses silu
        layer_norm_eps=1e-05,  # rms in llama,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        resid_pdrop=0.1,  # llama doesn't have this
        partial_rotary_factor=0.5,  # important difference between llama and phi
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.rope_theta = rope_theta
        self.resid_pdrop = resid_pdrop
        self.partial_rotary_factor = partial_rotary_factor

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


# this is the same as llama except for Phi uses bias=True
def load_attention(config, prefix, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if config.quantize not in ["gptq", "awq"]:
        weight = weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    # this is the same as llama except for Phi uses bias=True
    return TensorParallelColumnLinear(get_linear(weight, bias=True))


class FlashPhiAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads

        self.softmax_scale = self.head_size**-0.5
        self.rotary_dim = int(config.partial_rotary_factor * self.head_size)
        self.rotary_emb = rotary_emb

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )

        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        # in llama the dense layer is called "o_proj" and has bias=False
        self.dense = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.dense",
            weights=weights,
            bias=True,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        # Compute query, key, value and split
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )

        # Reshape query and key for rotary embeddings
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        # NOTE: this is the main difference between Llama and Phi
        # in llama the rotary embeddings are applied to the whole query and key.
        # Phi uses PARTIAL rotary embeddings, which are applied to the first 32 dimensions
        #
        # Apply partial positional embeddings in place
        self.rotary_emb(
            query[:, :, : self.rotary_dim], kv[:, 0, :, : self.rotary_dim], cos, sin
        )

        # Reshape key and value and cache
        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_scales=self.kv_scales,
                kv_cache=kv_cache,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.dense(attn_output.view(-1, self.num_heads * self.head_size))


class PhiMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        # llama weights are up_proj and down_proj and bias=False
        self.up_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.fc1",
            weights=weights,
            bias=True,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.fc2",
            weights=weights,
            bias=True,
        )

    def forward(self, hidden_states):
        # NOTE: Llama requires the gate up states to an intermediate size
        # Phi does not and we can avoid the `view` operation
        return self.down_proj(self.act(self.up_proj(hidden_states)))


class FlashPhiLayer(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights, rotary_emb):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"
        self.self_attn = FlashPhiAttention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            rotary_emb=rotary_emb,
        )
        self.mlp = PhiMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.input_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.input_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )
        self.resid_dropout = torch.nn.Dropout(config.resid_pdrop)

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        hidden_states, res = self.input_layernorm(hidden_states, residual)
        # Self Attention
        attn_output = self.self_attn(
            hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        hidden_states = self.resid_dropout(attn_output).add(
            self.resid_dropout(self.mlp(hidden_states))
        )

        return hidden_states, res


class FlashPhiModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=int(
                config.partial_rotary_factor
                * (config.hidden_size // config.num_attention_heads)
            ),
            base=config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                FlashPhiLayer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                    rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

        self.norm = FastLayerNorm.load(
            prefix="model.final_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashPhiForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.model = FlashPhiModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]

        return self.lm_head(hidden_states)


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py
================================================
# coding=utf-8
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""PyTorch Phi-MoE model."""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


PHIMOE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/Phi-3.5-MoE-instruct": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/resolve/main/config.json",
}


class PhiMoEConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PhiMoEModel`]. It is used to instantiate a Phi-MoE
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the
    [microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32064):
            Vocabulary size of the PhiMoE model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`PhiMoEModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 6400):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. Mixtral's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
            contain the following keys: `type`, `short_factor`, `long_factor`, `short_mscale`, `long_mscale` and
            `original_max_position_embeddings`. The `type` must be `longrope`, the `short_mscale` and `long_scale` must
            be numbers, the `short_factor` and `long_factor` must be lists of numbers with the same length as half of
            the attention head size and the `original_max_position_embeddings` must be an integer.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `262144`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to root per-token, can be also interpreted as the `top-p` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 16):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabeling this will also
            allow the model to output the auxiliary loss. See [here]() for more details
        router_aux_loss_coef (`float`, *optional*, defaults to 0.0):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.01):
            Amount of noise to add to the router.

    ```python
    >>> from transformers import PhiMoEModel, PhiMoEConfig

    >>> # Initializing a Phi-3 style configuration
    >>> configuration = PhiMoEConfig.from_pretrained("microsoft/Phi-3.5-MoE-instruct")

    >>> # Initializing a model from the configuration
    >>> model = PhiMoEModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "phimoe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32064,
        hidden_size=4096,
        intermediate_size=6400,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        rope_scaling=None,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=2,
        num_local_experts=16,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        router_jitter_noise=0.01,
        input_jitter_noise=0.0,
        attention_bias=False,
        lm_head_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        self.attention_bias = attention_bias
        self.lm_head_bias = lm_head_bias
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.router_jitter_noise = router_jitter_noise
        self.input_jitter_noise = input_jitter_noise

        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 6:
            raise ValueError(
                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor`, `long_factor`, "
                f"`short_mscale`, `long_mscale` and `original_max_position_embeddings`, got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
        rope_scaling_short_mscale = self.rope_scaling.get("short_mscale", None)
        rope_scaling_long_mscale = self.rope_scaling.get("long_mscale", None)
        original_max_position_embeddings = self.rope_scaling.get(
            "original_max_position_embeddings", None
        )
        if rope_scaling_type is None or rope_scaling_type not in ["longrope"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}"
            )
        if not (
            isinstance(rope_scaling_short_factor, list)
            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
        ):
            raise ValueError(
                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
            )
        if (
            not len(rope_scaling_short_factor)
            == self.hidden_size // self.num_attention_heads // 2
        ):
            raise ValueError(
                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
            )
        if not (
            isinstance(rope_scaling_long_factor, list)
            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
        ):
            raise ValueError(
                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
            )
        if (
            not len(rope_scaling_long_factor)
            == self.hidden_size // self.num_attention_heads // 2
        ):
            raise ValueError(
                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
            )
        if not isinstance(rope_scaling_short_mscale, (int, float)):
            raise ValueError(
                f"`rope_scaling`'s short_mscale field must be a number, got {rope_scaling_short_mscale}"
            )
        if not isinstance(rope_scaling_long_mscale, (int, float)):
            raise ValueError(
                f"`rope_scaling`'s long_mscale field must be a number, got {rope_scaling_long_mscale}"
            )
        if not isinstance(original_max_position_embeddings, int):
            raise ValueError(
                f"`rope_scaling`'s original_max_position_embeddings field must be an integer, got {original_max_position_embeddings}"
            )


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
================================================
import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
import habana_frameworks.torch as htorch


def load_attention(config, prefix, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    return TensorParallelColumnLinear.load_multi(
        config,
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
        weights=weights,
        bias=True,
    )


class Qwen2Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.max_past = (
            config.sliding_window
            if config.use_sliding_window and config.sliding_window is not None
            else -1
        )
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        self.rotary_emb = rotary_emb

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)

        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
                window_size_left=self.max_past,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
                window_size_left=self.max_past,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class Qwen2MLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])


class Qwen2Layer(nn.Module):
    def __init__(self, prefix, layer_id, config, weights, rotary_emb):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"
        self.self_attn = Qwen2Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            rotary_emb=rotary_emb,
        )
        self.mlp = Qwen2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        normed_hidden_states, residual = self.input_layernorm(hidden_states)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        hidden_states = attn_output + residual

        # faster post attention rms norm
        hidden_states, residual = self.post_attention_layernorm(hidden_states)
        mlp_output = self.mlp(hidden_states)
        hidden_states = mlp_output + residual
        return hidden_states


class Qwen2Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        prefix = f"{prefix}.model" if prefix else "model"

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()

        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.hidden_size // config.num_attention_heads,
            base=config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                Qwen2Layer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                    rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )
        hidden_states = inputs_embeds

        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids,
        )

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states)

        return hidden_states


class Qwen2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.model = Qwen2Model(prefix, config, weights)

        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        self.lm_head = SpeculativeHead.load(
            config,
            prefix=f"{prefix}.{suffix}" if prefix else suffix,
            weights=weights,
        )

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens" if prefix else "model.embed_tokens",
            weights=weights,
        )

        self.max_past = config.sliding_window
        self.max_past_tensor = (
            torch.tensor(config.sliding_window, device=weights.device)
            if self.max_past is not None
            else None
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.embed_tokens(input_ids)

        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_modeling.py
================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, Tuple, List

import torch
from torch import nn
import habana_frameworks.torch as htorch
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers import (
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    SpeculativeHead,
)


from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
from .flash_qwen2_modeling import Qwen2MLP
from text_generation_server.layers.rotary import PositionRotaryEmbedding


class Qwen3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config, prefix, weights, layer_idx, rotary_emb):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        self.num_heads = config.num_attention_heads
        self.attention_dropout = config.attention_dropout
        self.softmax_scale = self.head_dim**-0.5
        self.rotary_emb = rotary_emb

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )
        self.query_key_value = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )

        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )

        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

        self.max_past = (
            config.sliding_window if config.sliding_window is not None else -1
        )

        self.q_norm = FastRMSNorm.load(
            prefix=f"{prefix}.q_norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.k_norm = FastRMSNorm.load(
            prefix=f"{prefix}.k_norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.sliding_window = config.sliding_window
        if not (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            self.sliding_window = None

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
        qkv = self.query_key_value(hidden_states)
        query_states, key_states, value_states = qkv.split(
            [
                self.head_dim * self.num_heads,
                self.head_dim * self.num_key_value_heads,
                self.head_dim * self.num_key_value_heads,
            ],
            dim=1,
        )

        query_states, _ = self.q_norm(query_states.view(hidden_shape))
        key_states, _ = self.k_norm(key_states.view(hidden_shape))
        value_states = value_states.view(hidden_shape)
        self.rotary_emb(query_states, key_states, cos, sin)

        kv_cache.store(
            key=key_states,
            value=value_states,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query_states,
                key=key_states,
                value=value_states,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
                window_size_left=self.max_past,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query_states,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
                window_size_left=self.max_past,
            )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        return self.o_proj(attn_output)


class Qwen3DecoderLayer(nn.Module):
    def __init__(self, config, prefix, weights, layer_idx: int, rotary_emb):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Qwen3Attention(
            config=config,
            prefix=f"{prefix}.self_attn",
            weights=weights,
            layer_idx=layer_idx,
            rotary_emb=rotary_emb,
        )
        self.mlp = Qwen2MLP(config=config, prefix=f"{prefix}.mlp", weights=weights)
        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states, _ = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states = self.self_attn(
            hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states, _ = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


class Qwen3Model(nn.Module):
    def __init__(self, config, prefix: str, weights):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=head_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                Qwen3DecoderLayer(
                    config=config,
                    prefix=f"{prefix}.layers.{layer_idx}",
                    weights=weights,
                    layer_idx=layer_idx,
                    rotary_emb=rotary_emb,
                )
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )
        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids,
        )

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()

        for i, decoder_layer in enumerate(self.layers):
            hidden_states = decoder_layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        return hidden_states


class Qwen3ForCausalLM(nn.Module):

    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.model = Qwen3Model(config=config, prefix="model", weights=weights)
        self.vocab_size = config.vocab_size
        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        self.lm_head = SpeculativeHead.load(
            config,
            prefix=f"{prefix}.{suffix}" if prefix else suffix,
            weights=weights,
        )

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens" if prefix else "model.embed_tokens",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.embed_tokens(input_ids)
        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss

        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)

        return logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_qwen3_moe_modeling.py
================================================
# coding=utf-8
# Copyright 5 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Type

import torch
from torch import nn
import torch.nn.functional as F
from text_generation_server.layers.attention import (
    attention,
    paged_attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
from text_generation_server.layers import (
    TensorParallelEmbedding,
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    SpeculativeHead,
    FastLinear,
)

from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
from .flash_qwen2_modeling import Qwen2MLP
from .flash_qwen3_modeling import Qwen3Attention
from transformers.activations import ACT2FN
from text_generation_server.layers.rotary import PositionRotaryEmbedding


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class Qwen3MoeAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config, prefix, weights, layer_idx, rotary_emb):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = FastLinear.load(
            config, f"{prefix}.q_proj", weights, bias=config.attention_bias
        )

        self.k_proj = FastLinear.load(
            config, f"{prefix}.k_proj", weights, bias=config.attention_bias
        )
        self.v_proj = FastLinear.load(
            config, f"{prefix}.v_proj", weights, bias=config.attention_bias
        )
        self.o_proj = FastLinear.load(
            config, f"{prefix}.o_proj", weights, bias=config.attention_bias
        )
        self.rotary_emb = rotary_emb

        self.q_norm = FastRMSNorm.load(
            prefix=f"{prefix}.q_norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

        self.k_norm = FastRMSNorm.load(
            prefix=f"{prefix}.k_norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

        self.max_past = (
            config.sliding_window if config.sliding_window is not None else -1
        )

        self.kv_scales = get_kv_scales(weights, f"{prefix}")
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_key_value_groups)

        self.sliding_window = config.sliding_window
        if not (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            self.sliding_window = None

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states, _ = self.q_norm(self.q_proj(hidden_states).view(hidden_shape))
        key_states, _ = self.k_norm(self.k_proj(hidden_states).view(hidden_shape))
        value_states = self.v_proj(hidden_states).view(hidden_shape)

        self.rotary_emb(query_states, key_states, cos, sin)
        # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        kv_cache.store(
            key=key_states,
            value=value_states,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query_states,
                key=key_states,
                value=value_states,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.scaling,
                window_size_left=self.max_past,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query_states,
                kv_cache,
                self.kv_head_mapping,
                self.scaling,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
                window_size_left=self.max_past,
            )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output


class Qwen3MoE(nn.Module):
    def __init__(self, prefix, config, moe_layer_cls: Type[MoELayer], weights):
        super().__init__()

        # gating
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)

        self.moe = moe_layer_cls(
            n_expert_group=None,
            n_experts=config.num_experts,
            prefix=f"{prefix}.experts",
            renormalize=True,
            topk=config.num_experts_per_tok,
            topk_group=None,
            weights=weights,
        )
        # gate_proj_name="w1",
        # up_proj_name="w3",
        # down_proj_name="w2",

        assert isinstance(self.moe, MoELayer)

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        router_logits = self.gate(x)
        out = self.moe(x, gating_output=router_logits)

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class Qwen3MoeMLP(nn.Module):
    def __init__(self, prefix, config, weights, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = (
            intermediate_size
            if intermediate_size is not None
            else config.intermediate_size
        )
        # Fuse gate and up proj
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        gate_up_states = self.gate_up_proj(x)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])


class Qwen3MoeSparseMoeBlock(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # gating
        # self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)
        self.experts = nn.ModuleList(
            [
                Qwen3MoeMLP(
                    prefix=f"{prefix}.experts.{i}",
                    config=config,
                    weights=weights,
                    intermediate_size=config.moe_intermediate_size,
                )
                for i in range(self.num_experts)
            ]
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        input_shape = hidden_states.shape
        _, hidden_dim = hidden_states.shape
        # hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=hidden_states.dtype)
        routing_weights, selected_experts = torch.topk(
            routing_weights, self.top_k, dim=-1
        )
        if self.norm_topk_prob:  # only diff with mixtral sparse moe block!
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (input_shape), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be sollicitated
        expert_mask = torch.nn.functional.one_hot(
            selected_experts, num_classes=self.num_experts
        ).permute(2, 1, 0)
        # Loop over all available experts in the model and perform the computation on each expert
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx])

            # Index the correct hidden states and compute the expert hidden state for
            # the current expert. We need to make sure to multiply the output hidden
            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = (
                expert_layer(current_state) * routing_weights[top_x, idx, None]
            )

            # However `index_add_` only support torch tensors for indexing so we'll use
            # the `top_x` tensor here.
            final_hidden_states.index_add_(
                0, top_x, current_hidden_states.to(hidden_states.dtype)
            )
        final_hidden_states = final_hidden_states.reshape(input_shape)
        return final_hidden_states


class Qwen3MoeDecoderLayer(nn.Module):
    def __init__(self, config, prefix, weights, layer_idx: int, rotary_emb):
        super().__init__()
        self.hidden_size = config.hidden_size

        if config.num_key_value_heads // weights.process_group.size() > 0:
            self.self_attn = Qwen3Attention(
                config,
                prefix=f"{prefix}.self_attn",
                weights=weights,
                layer_idx=layer_idx,
                rotary_emb=rotary_emb,
            )
        else:
            self.self_attn = Qwen3MoeAttention(
                config,
                prefix=f"{prefix}.self_attn",
                weights=weights,
                layer_idx=layer_idx,
                rotary_emb=rotary_emb,
            )

        moe_layer_cls = (
            SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
        )

        if (layer_idx not in config.mlp_only_layers) and (
            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
        ):
            self.mlp = Qwen3MoE(f"{prefix}.mlp", config, moe_layer_cls, weights)
            # self.mlp = Qwen3MoeSparseMoeBlock(f"{prefix}.mlp", config, weights)

        else:
            self.mlp = Qwen2MLP(config=config, prefix=f"{prefix}.mlp", weights=weights)

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ) -> torch.Tensor:
        if residual is None:
            residual = hidden_states

        hidden_states, _ = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states = self.self_attn(
            hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states, _ = self.post_attention_layernorm(hidden_states)

        hidden_states = self.mlp(hidden_states)

        hidden_states = residual + hidden_states
        return hidden_states


class Qwen3MoeModel(nn.Module):
    def __init__(self, config, prefix: str, weights):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads
        )
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=head_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        self.layers = nn.ModuleList(
            [
                Qwen3MoeDecoderLayer(
                    config=config,
                    prefix=f"{prefix}.layers.{layer_idx}",
                    weights=weights,
                    layer_idx=layer_idx,
                    rotary_emb=rotary_emb,
                )
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, inputs_embeds.shape[0]
            )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids,
        )

        residual = None
        for i, decoder_layer in enumerate(self.layers):
            hidden_states = decoder_layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )

        hidden_states, _ = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        return hidden_states


class Qwen3MoeForCausalLM(nn.Module):

    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.model = Qwen3MoeModel(config=config, prefix="model", weights=weights)
        self.vocab_size = config.vocab_size
        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        self.lm_head = SpeculativeHead.load(
            config,
            prefix=f"{prefix}.{suffix}" if prefix else suffix,
            weights=weights,
        )

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens" if prefix else "model.embed_tokens",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:

        inputs_embeds = self.embed_tokens(input_ids)
        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)

        return logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
================================================
from typing import List, Optional, Tuple

import torch
import torch.distributed
from torch import nn
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel
from text_generation_server.layers import (
    SpeculativeHead,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import FastLayerNorm
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.attention import (
    attention,
    paged_attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
import habana_frameworks.torch as htorch


def load_row(config, prefix: str, weights, bias: bool):
    weight = weights.get_weights_row(prefix)

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None

    linear = get_linear(weight, bias)
    if config.parallel_attn:
        return linear
    else:
        return TensorParallelRowLinear(linear, process_group=weights.process_group)


class RWConfig(PretrainedConfig):
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_key_value_heads": "n_head_kv",
    }

    def __init__(
        self,
        model_type="RefinedWeb",
        vocab_size=250880,
        hidden_size=64,
        num_hidden_layers=None,
        num_attention_heads=None,
        num_ln_in_prallel_attention=None,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        num_kv_heads=None,
        multi_query=False,
        alibi=False,
        new_decoder_architecture=None,
        bias=False,
        parallel_attn=False,
        rope_theta=10_000.0,
        **kwargs,
    ):
        if alibi:
            raise NotImplementedError(
                "alibi is not supported by this version of the model"
            )

        self.model_type = model_type
        self.alibi = False
        self.rotary = True
        self.rope_theta = rope_theta
        self.max_position_embeddings = 2048

        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = (
            num_hidden_layers
            if num_hidden_layers is not None
            else kwargs.pop("n_layer", 2)
        )
        self.n_head = (
            num_attention_heads
            if num_attention_heads is not None
            else kwargs.pop("n_head", 8)
        )
        self.layer_norm_epsilon = layer_norm_epsilon
        self.num_ln_in_parallel_attn = num_ln_in_prallel_attention
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.bias = bias
        self.parallel_attn = parallel_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        if num_kv_heads is not None:
            self.n_head_kv = num_kv_heads
        else:
            old_n_head_kv = kwargs.pop("n_head_kv", None)
            if old_n_head_kv is not None:
                self.n_head_kv = old_n_head_kv
            else:
                self.n_head_kv = 1 if multi_query else self.n_head

        if new_decoder_architecture is not None:
            self.new_decoder_architecture = new_decoder_architecture
        elif model_type == "RefinedWeb":
            self.new_decoder_architecture = True
        else:
            self.new_decoder_architecture = False

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)


class FlashRWAttention(torch.nn.Module):
    def __init__(
        self,
        config,
        prefix: str,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.num_heads = config.n_head
        self.num_heads_kv = config.n_head_kv
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        self.rope_theta = config.rope_theta
        self.rotary_emb = rotary_emb

        self.softmax_scale = self.head_size ** (-0.5)

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.query_key_value = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.query_key_value",
            weights=weights,
            bias=config.bias,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")
        self.dense = load_row(
            config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias
        )

        if self.num_heads_kv == 1:
            self.kv_head_mapping = torch.zeros(
                self.num_heads, dtype=torch.int32, device=weights.device
            )
        else:
            self.kv_head_mapping = torch.arange(
                0, self.num_heads, dtype=torch.int32, device=weights.device
            )

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states)

        # Split query from key_value
        query, kv = qkv.split(
            [self.head_size * self.num_heads, 2 * self.head_size * self.num_heads_kv],
            dim=1,
        )

        # Prepare query and key_value for indexing
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_heads_kv, self.head_size)

        # Inplace rotary
        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.dense(attn_output.view(-1, self.num_heads * self.head_size))


class FlashRWLargeAttention(torch.nn.Module):
    def __init__(
        self,
        config,
        prefix: str,
        weights,
        rotary_emb,
    ):
        super().__init__()

        hidden_size = config.hidden_size
        num_heads = config.n_head
        # num_heads_kv = config.n_head_kv
        num_groups = config.n_head_kv

        self.hidden_size = hidden_size
        self.head_size = hidden_size // num_heads
        self.num_groups = num_groups
        self.rope_theta = config.rope_theta
        self.rotary_emb = rotary_emb

        self.softmax_scale = self.head_size ** (-0.5)

        # self.num_groups = num_heads // (num_heads_kv * 2)
        self.num_heads = num_heads // self.num_groups
        # self.num_heads_kv = num_heads_kv // self.num_groups
        process_group = weights.process_group

        if process_group.size() > self.num_groups:
            raise NotImplementedError(
                "Tensor Parallelism is not implemented for world_size > n groups"
            )
        if self.num_groups % process_group.size() != 0:
            raise NotImplementedError(
                f"Tensor Parallelism is not implemented for {self.num_groups} not divisible by {process_group.size()}"
            )

        self.num_groups = self.num_groups // process_group.size()

        self.query_key_value = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.query_key_value",
            weights=weights,
            bias=config.bias,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")
        self.dense = load_row(
            config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias
        )

        self.kv_head_mapping = torch.arange(
            0, self.num_groups, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_heads)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states)
        qkv = qkv.view(-1, self.num_groups, self.num_heads + 2, self.head_size)

        # Split on group dimension
        query, kv = qkv.split(
            [self.num_heads, 2],
            dim=2,
        )
        # Merge groups and heads
        query = query.reshape(-1, self.num_groups * self.num_heads, self.head_size)

        # Inplace rotary
        self.rotary_emb(query, torch.select(kv, dim=2, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, :, 0].contiguous(),
            value=kv[:, :, 1].contiguous(),
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv[:, :, 0],
                value=kv[:, :, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.dense(
            attn_output.view(-1, self.num_groups * self.num_heads * self.head_size)
        )


class FlashMLP(nn.Module):
    def __init__(self, config, prefix: str, weights):
        super().__init__()
        self.act = torch.nn.functional.gelu

        self.dense_h_to_4h = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=config.bias
        )
        self.dense_4h_to_h = load_row(
            config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=config.bias
        )

    def forward(self, hidden_states):
        hidden_states = self.dense_h_to_4h(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dense_4h_to_h(hidden_states)
        return hidden_states


class FlashRWLayer(nn.Module):
    def __init__(
        self,
        layer_id,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()

        parallel_attn = config.parallel_attn
        self.parallel_attn = parallel_attn

        prefix = f"{prefix}.h.{layer_id}"

        # NOTE: Falcon 180B uses the ln_attn prefix
        ln_prefix = "input_layernorm"
        if config.num_hidden_layers == 80:
            ln_prefix = "ln_attn"

        self.input_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.{ln_prefix}",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.self_attention = FlashRWAttention(
            config,
            prefix=f"{prefix}.self_attention",
            weights=weights,
            rotary_emb=rotary_emb,
        )
        self.post_attention_layernorm = (
            FastLayerNorm.load(
                prefix=f"{prefix}.post_attention_layernorm",
                weights=weights,
                eps=config.layer_norm_epsilon,
            )
            if not parallel_attn
            else None
        )

        self.mlp = FlashMLP(
            config,
            prefix=f"{prefix}.mlp",
            weights=weights,
        )

        self.process_group = weights.process_group

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        if self.parallel_attn:
            ln_hidden_states, residual = self.input_layernorm(hidden_states, residual)

            attn_output = self.self_attention(
                ln_hidden_states,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache,
                slots,
                seqlen,
                hpu_attention_meta,
            )

            mlp_output = self.mlp(ln_hidden_states)
            intermediate = mlp_output + attn_output

            if self.process_group.size() > 1:
                torch.distributed.all_reduce(intermediate, group=self.process_group)

            return intermediate, residual
        else:
            hidden_states, residual = self.input_layernorm(hidden_states, residual)

            hidden_states = self.self_attention(
                hidden_states,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache,
                slots,
                seqlen,
                hpu_attention_meta,
            )

            if self.post_attention_layernorm is not None:
                hidden_states, residual = self.post_attention_layernorm(
                    hidden_states, residual
                )

            mlp_output = self.mlp(hidden_states)

            return mlp_output, residual


class FlashRWLayerNorm(nn.Module):
    def __init__(self, config, prefix: str, weights):
        super().__init__()
        # Falcon2 includes the number of layer norms in the config
        # in the case no number of layer norms is provided, we default to 1
        self.num_ln = getattr(config, "num_ln_in_parallel_attn", 1)

        # Falcon 180B uses the ln_attn prefix and has 2 layer norms
        if config.num_hidden_layers == 80:
            self.num_ln = 2

        if self.num_ln == 1:
            self.input_ln = FastLayerNorm.load(
                prefix=f"{prefix}.input_layernorm",
                weights=weights,
                eps=config.layer_norm_epsilon,
            )
        elif self.num_ln == 2:
            self.ln_attn = FastLayerNorm.load(
                prefix=f"{prefix}.ln_attn",
                weights=weights,
                eps=config.layer_norm_epsilon,
            )
            self.ln_mlp = FastLayerNorm.load(
                prefix=f"{prefix}.ln_mlp",
                weights=weights,
                eps=config.layer_norm_epsilon,
            )
        else:
            raise ValueError("Number of layer norms can either be 1 or 2.")

    def forward(
        self,
        hidden_states,
        residual,
    ):
        if self.num_ln == 1:
            ln_hidden_states, residual = self.input_ln(hidden_states, residual)
            return ln_hidden_states, ln_hidden_states, residual
        elif self.num_ln == 2:
            ln_attn, residual = self.ln_attn(hidden_states, residual)
            ln_mlp, _ = self.ln_mlp(residual)
            return ln_attn, ln_mlp, residual


class FlashRWLargeLayer(nn.Module):
    def __init__(self, layer_id, prefix: str, config, weights, rotary_emb):
        super().__init__()
        prefix = f"{prefix}.h.{layer_id}"

        self.ln_layer = FlashRWLayerNorm(config, prefix, weights)

        self.self_attention = FlashRWLargeAttention(
            config,
            prefix=f"{prefix}.self_attention",
            weights=weights,
            rotary_emb=rotary_emb,
        )
        assert config.parallel_attn, "This version doesn't support non parallel_attn"

        self.mlp = FlashMLP(config, prefix=f"{prefix}.mlp", weights=weights)

        self.process_group = weights.process_group

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        # Layer norm.
        ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual)

        # Self attention.
        attn_output = self.self_attention(
            ln_attn,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        # MLP.
        mlp_output = self.mlp(ln_mlp)

        intermediate = attn_output + mlp_output

        if self.process_group.size() > 1:
            torch.distributed.all_reduce(intermediate, group=self.process_group)

        return intermediate, residual


class FlashRWPreTrainedModel(PreTrainedModel):
    config_class = RWConfig


class FlashRWModel(FlashRWPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)
        self.config = config

        self.word_embeddings = TensorParallelEmbedding(
            prefix=f"{prefix}.word_embeddings", weights=weights
        )
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.hidden_size // config.n_head,
            base=config.rope_theta,
            device=weights.device,
        )

        if config.new_decoder_architecture:
            self.h = nn.ModuleList(
                [
                    FlashRWLargeLayer(layer_id, prefix, config, weights, rotary_emb)
                    for layer_id in range(config.num_hidden_layers)
                ]
            )
            self.cache_size = self.h[0].self_attention.num_groups
        else:
            self.h = nn.ModuleList(
                [
                    FlashRWLayer(layer_id, prefix, config, weights, rotary_emb)
                    for layer_id in range(config.num_hidden_layers)
                ]
            )
            self.cache_size = self.h[0].self_attention.num_heads_kv

        self.ln_f = FastLayerNorm.load(
            prefix=f"{prefix}.ln_f",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.head_size = self.h[0].self_attention.head_size

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.word_embeddings(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.h[0].self_attention.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.h):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.ln_f(hidden_states, residual)

        return hidden_states


class FlashRWForCausalLM(FlashRWPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)

        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"

        self.transformer = FlashRWModel(prefix, config, weights)

        self.lm_head = SpeculativeHead.load(config, prefix="lm_head", weights=weights)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.transformer(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
================================================
import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    SpeculativeHead,
    TensorParallelEmbedding,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.gptq import GPTQWeightsLoader
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)
import habana_frameworks.torch as htorch


def load_multi_mqa(
    config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size
):
    if config.quantize == "gptq":
        return _load_multi_mqa_gptq(
            config, prefix, weights, bias, head_size, num_heads, hidden_size
        )
    else:
        return _load_multi_mqa(
            config, prefix, weights, bias, head_size, num_heads, hidden_size
        )


def _load_multi_mqa_gptq(
    config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size
):
    from text_generation_server.layers.gptq import GPTQWeight

    if any("c_attn" in k for k in weights.routing.keys()) and not config.transpose:
        world_size = weights.process_group.size()
        rank = weights.process_group.rank()

        slice_ = weights._get_slice(f"{prefix}.c_attn.qweight")
        shape = slice_.get_shape()
        block_size = (shape[1] - 2 * head_size) // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        assert (shape[1] - 2 * head_size) % world_size == 0
        q_tensor = slice_[:, start:stop]
        kv_tensor = slice_[:, -2 * head_size :]
        qweight = torch.cat([q_tensor, kv_tensor], dim=1)
        qweight = qweight.to(device=weights.device)

        slice_ = weights._get_slice(f"{prefix}.c_attn.scales")
        shape = slice_.get_shape()
        block_size = (shape[1] - 2 * head_size) // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        assert (shape[1] - 2 * head_size) % world_size == 0
        q_tensor = slice_[:, start:stop]
        kv_tensor = slice_[:, -2 * head_size :]
        scales = torch.cat([q_tensor, kv_tensor], dim=1)
        scales = scales.to(device=weights.device)

        slice_ = weights._get_slice(f"{prefix}.c_attn.qzeros")
        shape = slice_.get_shape()
        block_size = (shape[1] - (2 * head_size) * 4 // 32) // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        assert 2 * head_size % (32 // 4) == 0
        q_tensor = slice_[:, start:stop]
        kv_tensor = slice_[:, -2 * head_size * 4 // 32 :]
        qzeros = torch.cat([q_tensor, kv_tensor], dim=1)
        qzeros = qzeros.to(device=weights.device)

        loader = weights.weights_loader
        assert isinstance(loader, GPTQWeightsLoader)
        loader._get_gptq_params(weights)
        if loader.quant_method == "gptq":
            g_idx = weights.get_tensor(f"{prefix}.c_attn.g_idx")
            g_idx = g_idx.to(device=weights.device)
        elif loader.quant_method == "awq":
            g_idx = None
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)

        from text_generation_server.layers.gptq import HAS_EXLLAMA

        weight = GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=loader.bits,
            groupsize=loader.groupsize,
            use_awq_kernel=loader.quantize == "awq",
            use_exllama=HAS_EXLLAMA,
        )

        if bias:
            slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
            shape = slice_.get_shape()
            block_size = (shape[0] - 2 * head_size) // world_size
            assert (shape[0] - 2 * head_size) % world_size == 0
            q_tensor = slice_[start:stop]
            start = rank * block_size
            stop = (rank + 1) * block_size
            q_tensor = slice_[start:stop]
            kv_tensor = slice_[-2 * head_size :]
            bias = torch.cat([q_tensor, kv_tensor], dim=0)
            bias = bias.to(device=weights.device)

        return TensorParallelColumnLinear(get_linear(weight, bias))
    else:
        raise NotImplementedError("Gptq loading with santacoder is not implemented")


def _load_multi_mqa(
    config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size
):
    if any("c_attn" in k for k in weights.routing.keys()):
        slice_ = weights._get_slice(f"{prefix}.c_attn.weight")
        shape = slice_.get_shape()
        world_size = weights.process_group.size()
        rank = weights.process_group.rank()
        if config.transpose:
            block_size = (shape[1] - 2 * head_size) // world_size
            start = rank * block_size
            stop = (rank + 1) * block_size
            assert (shape[1] - 2 * head_size) % world_size == 0
            q_tensor = slice_[:, start:stop]
            kv_tensor = slice_[:, -2 * head_size :]
            weight = torch.cat([q_tensor, kv_tensor], dim=1).T
        else:
            block_size = (shape[0] - 2 * head_size) // world_size
            start = rank * block_size
            stop = (rank + 1) * block_size
            assert (shape[0] - 2 * head_size) % world_size == 0
            q_tensor = slice_[start:stop]
            kv_tensor = slice_[-2 * head_size :]
            weight = torch.cat([q_tensor, kv_tensor], dim=0)
        if bias:
            slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
            shape = slice_.get_shape()
            block_size = (shape[0] - 2 * head_size) // world_size
            assert (shape[0] - 2 * head_size) % world_size == 0
            start = rank * block_size
            stop = (rank + 1) * block_size
            q_tensor = slice_[start:stop]
            kv_tensor = slice_[-2 * head_size :]
            bias = torch.cat([q_tensor, kv_tensor], dim=0)
    else:
        if config.transpose:
            w = [
                weights.get_sharded(f"{prefix}.q_attn.weight", dim=1).T,
                weights.get_tensor(f"{prefix}.kv_attn.weight").T,
            ]
            weight = torch.cat(w, dim=0)
        else:
            w = [
                weights.get_sharded(f"{prefix}.q_attn.weight", dim=0),
                weights.get_tensor(f"{prefix}.kv_attn.weight"),
            ]
            weight = torch.cat(w, dim=1)

        if bias:
            b = [
                weights.get_sharded(f"{prefix}.q_attn.bias", dim=0),
                weights.get_tensor(f"{prefix}.kv_attn.bias"),
            ]
            bias = torch.cat(b, dim=0)
        else:
            bias = None

    weight = weight.to(dtype=weights.dtype).to(device=weights.device)
    assert list(weight.shape) == [
        (num_heads + 2) * head_size,
        hidden_size,
    ], f"{weight.shape} != {[(num_heads + 2) * head_size, hidden_size]}"
    if bias is not None:
        bias = bias.to(dtype=weights.dtype).to(device=weights.device)
        assert list(bias.shape) == [
            (num_heads + 2) * head_size
        ], f"{weight.shape} != {[(num_heads + 2) * head_size]}"
    return TensorParallelColumnLinear(get_linear(weight, bias))


def load_col(config, prefix: str, weights, bias: bool):
    if config.transpose:
        weight = weights.get_sharded(f"{prefix}.weight", dim=1).T
    else:
        weight = weights.get_multi_weights_col([prefix], dim=0)

    if bias:
        bias = weights.get_sharded(f"{prefix}.bias", dim=0)
    else:
        bias = None
    return TensorParallelColumnLinear(get_linear(weight, bias))


def load_row(config, prefix: str, weights, bias: bool):
    if config.transpose:
        weight = weights.get_sharded(f"{prefix}.weight", dim=0).T
    else:
        weight = weights.get_weights_row(prefix)

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None
    return TensorParallelRowLinear(
        get_linear(weight, bias), process_group=weights.process_group
    )


class FlashMQAttention(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        num_heads = config.num_attention_heads
        hidden_size = config.hidden_size

        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.head_size = hidden_size // num_heads

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.softmax_scale = self.head_size ** (-0.5)

        self.c_attn = load_multi_mqa(
            config,
            prefix=prefix,
            weights=weights,
            bias=True,
            head_size=self.head_size,
            hidden_size=hidden_size,
            num_heads=self.num_heads,
        )
        self.c_proj = load_row(
            config, prefix=f"{prefix}.c_proj", weights=weights, bias=True
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")
        self.kv_head_mapping = torch.zeros(
            self.num_heads, dtype=torch.int32, device=weights.device
        )

    def forward(
        self,
        hidden_states,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        qkv = self.c_attn(hidden_states)

        # Split query from key_value
        query, key_value = qkv.split(
            [self.head_size * self.num_heads, 2 * self.head_size], dim=1
        )

        # Prepare query and key_value for indexing
        query = query.view(-1, self.num_heads, self.head_size)
        key_value = key_value.view(-1, 2, 1, self.head_size)

        kv_cache.store(
            key=key_value[:, 0],
            value=key_value[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=key_value[:, 0],
                value=key_value[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
            )

        return self.c_proj(attn_output.view(-1, self.num_heads * self.head_size))


class MLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.activation_function
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        self.c_fc = load_col(
            config, prefix=f"{prefix}.c_fc", weights=weights, bias=True
        )
        self.c_proj = load_row(
            config, prefix=f"{prefix}.c_proj", weights=weights, bias=True
        )

    def forward(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        return hidden_states


class Block(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights):
        super().__init__()
        prefix = f"{prefix}.h.{layer_id}"
        self.ln_1 = FastLayerNorm.load(
            prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
        )
        self.ln_2 = FastLayerNorm.load(
            prefix=f"{prefix}.ln_2", weights=weights, eps=config.layer_norm_epsilon
        )
        self.self_attn = FlashMQAttention(
            prefix=f"{prefix}.attn",
            config=config,
            weights=weights,
        )
        self.mlp = MLP(
            prefix=f"{prefix}.mlp",
            config=config,
            weights=weights,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        hpu_attention_meta,
    ):
        hidden_states, residual = self.ln_1(hidden_states, residual)
        hidden_states = self.self_attn(
            hidden_states,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )

        hidden_states, residual = self.ln_2(hidden_states, residual)

        mlp_output = self.mlp(hidden_states)

        return mlp_output, residual


class FlashSantacoderModel(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.config = config

        self.process_group = weights.process_group
        self.wte = TensorParallelEmbedding(
            prefix=f"{prefix}.wte",
            weights=weights,
            reduce=False,
        )
        self.wpe = TensorParallelEmbedding(
            prefix=f"{prefix}.wpe",
            weights=weights,
            reduce=False,
        )

        self.layers = nn.ModuleList(
            [
                Block(
                    prefix,
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.ln_f = FastLayerNorm.load(
            prefix="transformer.ln_f", weights=weights, eps=config.layer_norm_epsilon
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.wte(input_ids) + self.wpe(position_ids)

        if self.process_group.size() > 1:
            torch.distributed.all_reduce(hidden_states, group=self.process_group)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.ln_f(hidden_states, residual)

        return hidden_states


class FlashSantacoderForCausalLM(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"

        config.transpose = config.architectures[0].startswith("GPT2")
        self.model = FlashSantacoderModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config, prefix=f"{prefix}.wte", weights=weights
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
================================================
# coding=utf-8
# Copyright 2024 Starcoder2 AI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    set_block_mapping,
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.layers import (
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
    FastRMSNorm,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.utils.weights import UnquantizedWeight
import habana_frameworks.torch as htorch


class Starcoder2Config(PretrainedConfig):
    model_type = "starcoder2"

    def __init__(
        self,
        vocab_size=49152,
        hidden_size=3072,
        intermediate_size=12288,
        num_hidden_layers=30,
        num_attention_heads=24,
        num_key_value_heads=2,
        mlp_type="default",
        hidden_act="gelu_pytorch_tanh",
        max_position_embeddings=4096,
        initializer_range=0.018042,
        norm_type="layer_norm",
        norm_epsilon=1e-5,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        rope_theta=10000.0,
        sliding_window=None,
        attention_dropout=0.0,
        residual_dropout=0.0,
        embedding_dropout=0.0,
        use_bias: bool = True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        self.use_bias = use_bias

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.mlp_type = mlp_type
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_type = norm_type
        self.norm_epsilon = norm_epsilon
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.embedding_dropout = embedding_dropout

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )


def load_attention(config, prefix, weights, layer_id):
    prefixes = [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
    head_size = config.hidden_size // config.num_attention_heads
    sizes = [
        head_size * config.num_attention_heads,
        head_size * config.num_key_value_heads,
        head_size * config.num_key_value_heads,
    ]
    if config.num_attention_heads != config.num_key_value_heads:
        base_layer = _load_gqa(config, prefix, weights)
    else:
        base_layer = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=prefixes,
            dim=0,
            weights=weights,
            bias=config.use_bias,
        )
    return TensorParallelMultiAdapterLinear.load(
        base_layer=base_layer,
        layer_id=layer_id,
        layer_names=prefixes,
        sizes=sizes,
        process_group=weights.process_group,
    )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    if config.use_bias:
        w = [
            weights.get_sharded(f"{p}.bias", dim=0)
            for p in [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
        ]
        bias = torch.cat(w, dim=0).to(dtype=weights.dtype).to(device=weights.device)
    else:
        bias = None

    return TensorParallelColumnLinear(get_linear(weight, bias=bias))


class Starcoder2Attention(torch.nn.Module):
    def __init__(
        self,
        index: int,
        prefix: str,
        config,
        weights,
        rotary_emb,
    ):
        super().__init__()
        self.max_past = (
            config.sliding_window if config.sliding_window is not None else -1
        )
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        self.rotary_emb = rotary_emb

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights, index)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=getattr(config, "use_bias", False),
        )

        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            index,
            "o_proj",
            process_group=weights.process_group,
        )

        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        hpu_attention_meta,
    ):
        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # sdpa
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                softmax_scale=self.softmax_scale,
                window_size_left=self.max_past,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                seqlen,
                kv_scales=self.kv_scales,
                hpu_attention_meta=hpu_attention_meta,
                window_size_left=self.max_past,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Starcoder2MLP(nn.Module):
    def __init__(self, prefix, config, weights, index):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        c_fc = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.c_fc",
            weights=weights,
            bias=config.use_bias,
        )
        c_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=config.use_bias,
        )

        self.c_fc = TensorParallelMultiAdapterLinear.load(
            c_fc,
            layer_id=index,
            layer_names=[f"{prefix}.c_fc"],
            sizes=[config.intermediate_size, config.intermediate_size],
            process_group=weights.process_group,
        )

        self.c_proj = TensorParallelAdapterRowLinear.load(
            c_proj,
            index,
            "c_proj",
            process_group=weights.process_group,
        )

    def forward(self, hidden_states, adapter_data):
        hidden_states = self.c_fc(hidden_states, adapter_data)
        hidden_states = self.act(hidden_states)
        return self.c_proj(hidden_states, adapter_data)


class Starcoder2GatedMLP(nn.Module):
    def __init__(self, index, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        prefixes = [f"{prefix}.gate_proj", f"{prefix}.up_proj"]
        sizes = [
            config.intermediate_size,
            config.intermediate_size,
        ]
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=prefixes,
            weights=weights,
            dim=0,
            bias=config.use_bias,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            index,
            layer_names=prefixes,
            sizes=sizes,
            process_group=weights.process_group,
        )
        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=config.use_bias,
        )
        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            index,
            "down_proj",
            process_group=weights.process_group,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


STARCODER2_NORMALIZATION_CLASSES = {
    "layer_norm": FastLayerNorm,
    "rms_norm": FastRMSNorm,
}

STARCODER2_MLP_CLASSES = {
    "default": Starcoder2MLP,
    "gated": Starcoder2GatedMLP,
}


class Starcoder2Layer(nn.Module):
    def __init__(self, layer_id, config, weights, rotary_emb):
        super().__init__()
        prefix = f"model.layers.{layer_id}"
        self.self_attn = Starcoder2Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            index=layer_id,
            rotary_emb=rotary_emb,
        )

        self.mlp = STARCODER2_MLP_CLASSES[config.mlp_type](
            prefix=f"{prefix}.mlp", config=config, weights=weights, index=layer_id
        )

        self.input_layernorm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.norm_epsilon
        )
        self.post_attention_layernorm = STARCODER2_NORMALIZATION_CLASSES[
            config.norm_type
        ].load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.norm_epsilon,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        slots,
        seqlen,
        adapter_data,
        hpu_attention_meta,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            hpu_attention_meta,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        mlp_output = self.mlp(normed_attn_res_output, adapter_data)

        return mlp_output, attn_res


class Starcoder2Model(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=config.hidden_size // config.num_attention_heads,
            base=config.rope_theta,
            device=weights.device,
        )
        self.layers = nn.ModuleList(
            [
                Starcoder2Layer(
                    layer_id,
                    config,
                    weights,
                    rotary_emb,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.norm_epsilon
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        adapter_data,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
    ) -> torch.Tensor:
        if hpu_attention_meta is not None:
            hpu_attention_meta = set_block_mapping(
                hpu_attention_meta, input_ids.shape[0]
            )
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(position_ids)

        residual = None
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                slots,
                seqlen,
                adapter_data,
                hpu_attention_meta,
            )
            if lazy_mode:
                htorch.core.mark_step()

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashStarcoder2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.model = Starcoder2Model(prefix, config, weights)
        try:
            self.lm_head = SpeculativeHead.load(
                config,
                prefix="lm_head",
                weights=weights,
            )
        except RuntimeError:
            self.lm_head = SpeculativeHead.load(
                config,
                prefix=f"{prefix}.embed_tokens",
                weights=weights,
            )

        self.max_past = config.sliding_window
        self.max_past_tensor = (
            torch.tensor(config.sliding_window, device=weights.device)
            if self.max_past is not None
            else None
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            slots,
            seqlen,
            adapter_data,
            hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/idefics2.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Idefics2 model."""

from typing import List, Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn
import math

from transformers.activations import ACT2FN
from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
)
from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
)
from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class Idefics2VisionEmbeddings(nn.Module):
    """
    This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable
    resolution.

    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
    which allows treating images in their native aspect ratio and without the need to resize them to the same
    fixed size. In particular, we start from the original pre-trained SigLIP model
    (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions.
    """

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )
        self.patch_embedding.bias = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.position_embedding", weights=weights
        )

    def forward(
        self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor
    ) -> torch.Tensor:
        batch_size, _, max_im_h, max_im_w = pixel_values.shape

        patch_embeds = self.patch_embedding(pixel_values)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_nb_patches_h, max_nb_patches_w = (
            max_im_h // self.patch_size,
            max_im_w // self.patch_size,
        )
        boundaries = torch.arange(
            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
        )
        position_ids = torch.full(
            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
        )

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            nb_patches_h = p_attn_mask[:, 0].sum()
            nb_patches_w = p_attn_mask[0].sum()

            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)

            bucket_coords_h = torch.bucketize(
                fractional_coords_h, boundaries, right=True
            )
            bucket_coords_w = torch.bucketize(
                fractional_coords_w, boundaries, right=True
            )

            pos_ids = (
                bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
            ).flatten()
            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)
        embeddings = embeddings + self.position_embedding(position_ids)
        return embeddings


class Idefics2VisionAttention(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_size = self.embed_dim // self.num_heads
        if self.head_size * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_size**-0.5
        self.dropout = config.attention_dropout

        self.num_heads = self.num_heads // weights.process_group.size()
        self.embed_dim = self.embed_dim // weights.process_group.size()

        self.qkv = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.out_proj = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
        )
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        batch_size, q_len, _ = hidden_states.size()

        qkv = self.qkv(hidden_states)
        query_states, key_states, value_states = qkv.split(
            [
                self.head_size * self.num_heads,
                self.head_size * self.num_heads,
                self.head_size * self.num_heads,
            ],
            dim=2,
        )

        query_states = query_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        key_states = key_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        value_states = value_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)

        k_v_seq_len = key_states.shape[-2]
        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
        )

        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output


class Idefics2VisionMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Idefics2EncoderLayer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Idefics2VisionAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.layer_norm1 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm1", eps=config.layer_norm_eps, weights=weights
        )
        self.layer_norm2 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm2", eps=config.layer_norm_eps, weights=weights
        )
        self.mlp = Idefics2VisionMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights
        )

    # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class Idefics2Encoder(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                Idefics2EncoderLayer(
                    prefix=f"{prefix}.layers.{i}", config=config, weights=weights
                )
                for i in range(config.num_hidden_layers)
            ]
        )

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
            )
        return hidden_states


class Idefics2VisionTransformer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embeddings = Idefics2VisionEmbeddings(
            prefix=f"{prefix}.embeddings", config=config, weights=weights
        )
        self.encoder = Idefics2Encoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )
        self.post_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.post_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )

    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
    ):
        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            patch_size = self.config.patch_size
            patch_attention_mask = torch.ones(
                (
                    batch_size,
                    pixel_values.size(2) // patch_size,
                    pixel_values.size(3) // patch_size,
                )
            )
            patch_attention_mask = patch_attention_mask.to(
                dtype=torch.bool, device=pixel_values.device
            )

        hidden_states = self.embeddings(
            pixel_values=pixel_values, patch_attention_mask=patch_attention_mask
        )

        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
        # The call to `_upad_input` in `_flash_attention_forward` is expensive
        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
        if not torch.any(~patch_attention_mask):
            patch_attention_mask = None
        else:
            patch_attention_mask = _prepare_4d_attention_mask(
                patch_attention_mask, hidden_states.dtype
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=patch_attention_mask,
        )

        last_hidden_state = encoder_outputs
        last_hidden_state = self.post_layernorm(last_hidden_state)

        return last_hidden_state


class Idefics2MLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.text_config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )

    def forward(self, hidden_states):
        start_shape = hidden_states.shape[:-1]
        gate_up_states = self.gate_up_proj(hidden_states)
        intermediate_size = gate_up_states.shape[-1] // 2
        gate_up_states = gate_up_states.view(-1, 2, intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]
        ).view(*start_shape, -1)


class Idefics2RMSNorm(nn.Module):
    def __init__(self, prefix, weights, eps):
        """
        Idefics2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.weight"), requires_grad=False
        )
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


class Idefics2PerceiverAttention(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.layer_idx = None
        self.hidden_size = config.text_config.hidden_size
        self.num_heads = config.perceiver_config.resampler_n_heads
        self.head_size = config.perceiver_config.resampler_head_dim
        self.num_key_value_heads = config.perceiver_config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.attention_dropout = config.perceiver_config.attention_dropout
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            self.num_key_value_heads // weights.process_group.size()
        )

        self.q_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.q_proj",
            weights=weights,
            bias=False,
        )
        self.kv = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.o_proj", weights=weights, bias=False
        )

        self.is_causal = False

    def forward(
        self,
        latents: torch.Tensor,
        context: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = latents.size()
        kv_seq_len = q_len + context.size()[1]

        hidden_states = torch.concat([context, latents], dim=-2)
        query_states = self.q_proj(latents)
        kv = self.kv(hidden_states)
        key_states, value_states = kv.split(
            [
                self.head_size * self.num_key_value_heads,
                self.head_size * self.num_key_value_heads,
            ],
            dim=2,
        )

        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, kv_seq_len, self.num_key_value_heads, self.head_size
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, kv_seq_len, self.num_key_value_heads, self.head_size
        ).transpose(1, 2)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_size)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )

            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_size)

        attn_output = self.o_proj(attn_output)

        return attn_output


class Idefics2PerceiverLayer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.hidden_size = config.text_config.hidden_size
        self.n_latents = config.perceiver_config.resampler_n_latents
        self.depth = config.perceiver_config.resampler_depth
        self.rms_norm_eps = config.text_config.rms_norm_eps

        self.input_latents_norm = Idefics2RMSNorm(
            prefix=f"{prefix}.input_latents_norm",
            weights=weights,
            eps=self.rms_norm_eps,
        )
        self.input_context_norm = Idefics2RMSNorm(
            prefix=f"{prefix}.input_context_norm",
            weights=weights,
            eps=self.rms_norm_eps,
        )
        self.self_attn = Idefics2PerceiverAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.post_attention_layernorm = Idefics2RMSNorm(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=self.rms_norm_eps,
        )
        self.mlp = Idefics2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

    def forward(
        self,
        latents: torch.Tensor,
        context: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        """
        Args:
            latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
        """
        residual = latents

        latents = self.input_latents_norm(latents)
        context = self.input_context_norm(context)

        latents = self.self_attn(
            latents=latents,
            context=context,
            attention_mask=attention_mask,
        )
        latents = residual + latents
        residual = latents

        latents = self.post_attention_layernorm(latents)
        latents = self.mlp(latents)
        latents = residual + latents

        return latents


class Idefics2PerceiverResampler(nn.Module):
    def __init__(self, prefix, config, weights) -> None:
        super().__init__()
        self.hidden_size = config.text_config.hidden_size
        self.hidden_act = config.perceiver_config.hidden_act
        self.n_latents = config.perceiver_config.resampler_n_latents
        self.depth = config.perceiver_config.resampler_depth
        self.rms_norm_eps = config.text_config.rms_norm_eps

        # Create Latents for Perceiver
        self.latents = weights.get_tensor(f"{prefix}.latents")

        # Create Transformer Blocks
        self.layers = nn.ModuleList(
            [
                Idefics2PerceiverLayer(
                    prefix=f"{prefix}.layers.{idx}", config=config, weights=weights
                )
                for idx in range(self.depth)
            ]
        )
        self.norm = Idefics2RMSNorm(
            prefix=f"{prefix}.norm",
            weights=weights,
            eps=config.text_config.rms_norm_eps,
        )

    def forward(
        self,
        context: torch.Tensor,
        attention_mask,
    ) -> torch.Tensor:
        # seq embed -> bsz seq embed
        latents = self.latents.unsqueeze(0).expand(
            (context.shape[0], *self.latents.size())
        )

        latent_attention_mask = torch.ones(
            (attention_mask.size(0), latents.size(1)),
            dtype=attention_mask.dtype,
            device=attention_mask.device,
        )
        attention_mask = torch.cat([attention_mask, latent_attention_mask], dim=-1)
        attention_mask = _prepare_4d_attention_mask(
            attention_mask, latents.dtype, tgt_len=self.n_latents
        )

        compressed_context = latents
        for perceiver_layer in self.layers:
            compressed_context = perceiver_layer(
                compressed_context,
                context,
                attention_mask=attention_mask,
            )
        compressed_context = self.norm(compressed_context)

        return compressed_context


class Idefics2Connector(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.modality_projection = Idefics2MLP(
            prefix=f"{prefix}.modality_projection", config=config, weights=weights
        )
        self.perceiver_resampler = Idefics2PerceiverResampler(
            prefix=f"{prefix}.perceiver_resampler", config=config, weights=weights
        )

    def forward(self, image_hidden_states, attention_mask):
        image_hidden_states = self.modality_projection(image_hidden_states)
        image_hidden_states = self.perceiver_resampler(
            context=image_hidden_states, attention_mask=attention_mask
        )
        return image_hidden_states


class Idefics2ForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator

        vision_config = config.vision_config
        self.text_model = load_text_model(
            prefix="model" if not prefix else f"{prefix}.model",
            config=config.text_config,
            weights=weights,
            name="text_model",
        )
        self.dtype = weights.dtype

        # The vision and connector models are not quantized.
        with weights.use_loader(DefaultWeightsLoader(UnquantizedWeight)):
            self.vision_model = Idefics2VisionTransformer(
                prefix=(
                    f"{prefix}.model.vision_model" if prefix else "model.vision_model"
                ),
                config=vision_config,
                weights=weights,
            )

            config.quantize = None
            self.connector = Idefics2Connector(
                prefix=f"{prefix}.model.connector" if prefix else "model.connector",
                config=config,
                weights=weights,
            )

        self.config = config
        self.image_seq_len = config.perceiver_config.resampler_n_latents
        self.image_token_id = config.image_token_id
        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )

    def _merge_input_ids_with_image_features(
        self,
        input_ids: torch.Tensor,
        inputs_embeds: torch.Tensor,
        image_features: torch.Tensor,
    ):
        """In place merges in vision_embeddings with inputs_embeds."""
        # mask = input_ids == self.config.image_token_index
        #  - replace `==` with torch.where to fix the issue in hpu graph
        mask = torch.where(input_ids == self.config.image_token_id)
        # Let's pray we have enabled enough slots !
        inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
        return inputs_embeds

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        assert pixel_values is not None
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        all_states = []
        all_pixel_values = pixel_values
        all_pixel_mask = pixel_attention_mask
        for i in range(batch_size):
            pixel_values = all_pixel_values.to(dtype=self.dtype)  # fp16 compatibility
            pixel_values = pixel_values[i : i + 1]
            pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:])

            # Remove padding images - padding images are full 0.
            nb_values_per_image = pixel_values.shape[1:].numel()
            real_images_inds = (pixel_values == 0.0).sum(
                dim=(-1, -2, -3)
            ) != nb_values_per_image
            pixel_values = pixel_values[real_images_inds].contiguous()

            # Handle the vision attention mask
            if pixel_attention_mask is None:
                pixel_attention_mask = torch.ones(
                    size=(
                        pixel_values.size(0),
                        pixel_values.size(2),
                        pixel_values.size(3),
                    ),
                    dtype=torch.bool,
                    device=pixel_values.device,
                )
            else:
                # Remove padding images from the mask/pP p
                pixel_attention_mask = all_pixel_mask[i : i + 1]
                pixel_attention_mask = pixel_attention_mask.view(
                    1 * num_images, *pixel_attention_mask.shape[2:]
                )
                pixel_attention_mask = pixel_attention_mask[
                    real_images_inds
                ].contiguous()

            patch_size = self.config.vision_config.patch_size
            """
            patches_subgrid = pixel_attention_mask.unfold(
                dimension=1, size=patch_size, step=patch_size
            )
            patches_subgrid = patches_subgrid.unfold(
                dimension=2, size=patch_size, step=patch_size
            )
            patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
            """
            # hpu does none support unfold
            conv_kernel = torch.ones(
                [1, 1, patch_size, patch_size],
                dtype=pixel_values.dtype,
                device=pixel_values.device,
            )
            patches_subgrid = torch.nn.functional.conv2d(
                pixel_attention_mask.unsqueeze(1).to(conv_kernel.dtype),
                conv_kernel,
                stride=patch_size,
            ).squeeze(1)
            patch_attention_mask = torch.gt(patches_subgrid, 0)

            # Get sequence from the vision encoder
            image_hidden_states = self.vision_model(
                pixel_values=pixel_values,
                patch_attention_mask=patch_attention_mask,
            )

            # Modality projection & resampling
            image_hidden_states = self.connector(
                image_hidden_states,
                attention_mask=patch_attention_mask.view(pixel_values.size(0), -1),
            )
            all_states.append(image_hidden_states)
        image_hidden_states = torch.stack(all_states, dim=0)
        return image_hidden_states.view(-1, image_hidden_states.shape[-1])

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            inputs_embeds = self._merge_input_ids_with_image_features(
                input_ids, inputs_embeds, vision_embeds
            )
        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ):
        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            slots=slots,
            seqlen=seqlen,
            hpu_attention_meta=hpu_attention_meta,
            adapter_data=adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/idefics3.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Idefics3 model."""

from typing import List, Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
)
from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
)
from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class Idefics3VisionEmbeddings(nn.Module):
    """
    This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable
    resolution.

    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
    which allows treating images in their native aspect ratio and without the need to resize them to the same
    fixed size. In particular, we start from the original pre-trained SigLIP model
    (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions.
    """

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )
        self.patch_embedding.bias = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.position_embedding", weights=weights
        )

    def forward(
        self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor
    ) -> torch.Tensor:
        batch_size, _, max_im_h, max_im_w = pixel_values.shape

        patch_embeds = self.patch_embedding(pixel_values)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_nb_patches_h, max_nb_patches_w = (
            max_im_h // self.patch_size,
            max_im_w // self.patch_size,
        )
        boundaries = torch.arange(
            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
        )
        position_ids = torch.full(
            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
        )

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            nb_patches_h = p_attn_mask[:, 0].sum()
            nb_patches_w = p_attn_mask[0].sum()

            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)

            bucket_coords_h = torch.bucketize(
                fractional_coords_h, boundaries, right=True
            )
            bucket_coords_w = torch.bucketize(
                fractional_coords_w, boundaries, right=True
            )

            pos_ids = (
                bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
            ).flatten()
            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)
        embeddings = embeddings + self.position_embedding(position_ids)
        return embeddings


class Idefics3VisionAttention(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_size = self.embed_dim // self.num_heads
        if self.head_size * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_size**-0.5
        self.dropout = config.attention_dropout

        self.num_heads = self.num_heads // weights.process_group.size()
        self.embed_dim = self.embed_dim // weights.process_group.size()

        self.qkv = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.out_proj = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
        )
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        batch_size, q_len, _ = hidden_states.size()

        qkv = self.qkv(hidden_states)
        query_states, key_states, value_states = qkv.split(
            [
                self.head_size * self.num_heads,
                self.head_size * self.num_heads,
                self.head_size * self.num_heads,
            ],
            dim=2,
        )

        query_states = query_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        key_states = key_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        value_states = value_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)

        k_v_seq_len = key_states.shape[-2]
        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
        )

        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output


class Idefics3VisionMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Idefics3EncoderLayer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Idefics3VisionAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.layer_norm1 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm1", eps=config.layer_norm_eps, weights=weights
        )
        self.layer_norm2 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm2", eps=config.layer_norm_eps, weights=weights
        )
        self.mlp = Idefics3VisionMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights
        )

    # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class Idefics3Encoder(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                Idefics3EncoderLayer(
                    prefix=f"{prefix}.layers.{i}", config=config, weights=weights
                )
                for i in range(config.num_hidden_layers)
            ]
        )

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
            )
        return hidden_states


class Idefics3VisionTransformer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embeddings = Idefics3VisionEmbeddings(
            prefix=f"{prefix}.embeddings", config=config, weights=weights
        )
        self.encoder = Idefics3Encoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )
        self.post_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.post_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )

    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
    ):
        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            patch_size = self.config.patch_size
            patch_attention_mask = torch.ones(
                (
                    batch_size,
                    pixel_values.size(2) // patch_size,
                    pixel_values.size(3) // patch_size,
                )
            )
            patch_attention_mask = patch_attention_mask.to(
                dtype=torch.bool, device=pixel_values.device
            )

        hidden_states = self.embeddings(
            pixel_values=pixel_values, patch_attention_mask=patch_attention_mask
        )

        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
        # The call to `_upad_input` in `_flash_attention_forward` is expensive
        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
        if not torch.any(~patch_attention_mask):
            patch_attention_mask = None
        else:
            patch_attention_mask = _prepare_4d_attention_mask(
                patch_attention_mask, hidden_states.dtype
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=patch_attention_mask,
        )

        last_hidden_state = encoder_outputs
        last_hidden_state = self.post_layernorm(last_hidden_state)

        return last_hidden_state


class Idefics3SimpleMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        input_size = config.vision_config.hidden_size * (config.scale_factor**2)
        output_size = config.text_config.hidden_size
        proj = nn.Parameter(
            weights.get_tensor(f"{prefix}.modality_projection.proj.weight"),
            requires_grad=False,
        ).to(weights.dtype)
        self.proj = nn.Linear(input_size, output_size, bias=False)
        self.proj.weight = proj

    def forward(self, x):
        return self.proj(x)


class Idefics3Connector(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.modality_projection = Idefics3SimpleMLP(prefix, config, weights)
        self.scale_factor = config.scale_factor

    def pixel_shuffle(self, x, scale_factor=2):
        bsz, seq, embed_dim = x.size()
        height = width = int(seq**0.5)
        x = x.view(bsz, height, width, embed_dim)
        x = x.view(bsz, height, int(width / scale_factor), embed_dim * scale_factor)
        x = x.permute(0, 2, 1, 3)
        x = x.reshape(
            bsz,
            int(width / scale_factor),
            int(height / scale_factor),
            embed_dim * (scale_factor**2),
        )
        x = x.permute(0, 2, 1, 3)
        x = x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2))
        return x

    def forward(self, image_hidden_states):
        image_hidden_states = self.pixel_shuffle(image_hidden_states, self.scale_factor)
        image_hidden_states = self.modality_projection(image_hidden_states)
        return image_hidden_states


class Idefics3ForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator
        # set tie_word_embeddings to True to load `.embed_tokens.weight` instead of `.lm_head.weight`
        # since Idefics3 uses the `embed_tokens` for the final prediction
        # config.text_config.tie_word_embeddings = True

        vision_config = config.vision_config
        self.text_model = load_text_model(
            prefix="model" if not prefix else f"{prefix}.model",
            config=config.text_config,
            weights=weights,
            name="text_model",
        )
        self.dtype = weights.dtype

        # The vision and connector models are not quantized.
        with weights.use_loader(DefaultWeightsLoader(UnquantizedWeight)):
            self.vision_model = Idefics3VisionTransformer(
                prefix=(
                    f"{prefix}.model.vision_model" if prefix else "model.vision_model"
                ),
                config=vision_config,
                weights=weights,
            )

            config.quantize = None
            self.connector = Idefics3Connector(
                prefix=f"{prefix}.model.connector" if prefix else "model.connector",
                config=config,
                weights=weights,
            )

        self.config = config
        self.image_token_id = config.image_token_id
        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )

    def _merge_input_ids_with_image_features(
        self,
        input_ids: torch.Tensor,
        inputs_embeds: torch.Tensor,
        image_features: torch.Tensor,
    ):
        """In place merges in vision_embeddings with inputs_embeds."""
        # mask = input_ids == self.config.image_token_index
        #  - replace `==` with torch.where to fix the issue in hpu graph
        mask = torch.where(input_ids == self.config.image_token_id)
        # Let's pray we have enabled enough slots !
        inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
        return inputs_embeds

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        all_states = []
        all_pixel_values = pixel_values
        all_pixel_mask = pixel_attention_mask
        for i in range(batch_size):
            pixel_values = all_pixel_values.to(dtype=self.dtype)  # fp16 compatibility
            pixel_values = pixel_values[i : i + 1]
            pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:])

            # Remove padding images - padding images are full 0.
            nb_values_per_image = pixel_values.shape[1:].numel()
            real_images_inds = (pixel_values == 0.0).sum(
                dim=(-1, -2, -3)
            ) != nb_values_per_image
            pixel_values = pixel_values[real_images_inds].contiguous()
            # Handle the vision attention mask
            if pixel_attention_mask is None:
                pixel_attention_mask = torch.ones(
                    size=(
                        pixel_values.size(0),
                        pixel_values.size(2),
                        pixel_values.size(3),
                    ),
                    dtype=torch.bool,
                    device=pixel_values.device,
                )
            else:
                # Remove padding images from the mask/pP p
                pixel_attention_mask = all_pixel_mask[i : i + 1]
                pixel_attention_mask = pixel_attention_mask.view(
                    1 * num_images, *pixel_attention_mask.shape[2:]
                )
                pixel_attention_mask = pixel_attention_mask[
                    real_images_inds
                ].contiguous()

            patch_size = self.config.vision_config.patch_size

            """
            patches_subgrid = pixel_attention_mask.unfold(
                dimension=1, size=patch_size, step=patch_size
            )
            patches_subgrid = patches_subgrid.unfold(
                dimension=2, size=patch_size, step=patch_size
            )
            patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
            """
            # hpu does none support unfold
            conv_kernel = torch.ones(
                [1, 1, patch_size, patch_size],
                dtype=pixel_values.dtype,
                device=pixel_values.device,
            )
            patches_subgrid = torch.nn.functional.conv2d(
                pixel_attention_mask.unsqueeze(1).to(conv_kernel.dtype),
                conv_kernel,
                stride=patch_size,
            ).squeeze(1)
            patch_attention_mask = torch.gt(patches_subgrid, 0)

            # Get sequence from the vision encoder
            image_hidden_states = self.vision_model(
                pixel_values=pixel_values,
                patch_attention_mask=patch_attention_mask,
            )

            # Modality projection & resampling
            image_hidden_states = self.connector(
                image_hidden_states,
            )

            all_states.append(image_hidden_states)
        image_hidden_states = torch.stack(all_states, dim=0)

        return image_hidden_states.view(-1, image_hidden_states.shape[-1])

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            inputs_embeds = self._merge_input_ids_with_image_features(
                input_ids, inputs_embeds, vision_embeds
            )
        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
        image_indices=None,
    ):
        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            slots=slots,
            seqlen=seqlen,
            hpu_attention_meta=hpu_attention_meta,
            adapter_data=adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/mamba_modeling.py
================================================
import torch
import torch.distributed

from mamba_ssm.ops.triton.selective_state_update import selective_state_update
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
from torch import nn
from typing import Optional, Tuple, Any
from transformers.configuration_utils import PretrainedConfig
import torch.nn.functional as F

from text_generation_server.layers import (
    SpeculativeHead,
    TensorParallelEmbedding,
    FastLinear,
)
from text_generation_server.layers.layernorm import FastRMSNorm

from einops import rearrange
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
import math
from dataclasses import dataclass


@dataclass
class InferenceParams:
    """Inference parameters that are passed to the main model in order
    to efficienly calculate and store the context during inference."""

    max_seqlen: int
    max_batch_size: int
    conv_states: torch.Tensor
    ssm_states: torch.Tensor
    seqlen_offset: int


class MambaConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=50280,
        d_model=768,
        d_state=16,
        n_layer=32,
        layer_norm_epsilon=1e-5,
        tie_word_embeddings=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        expand=2,
        dt_rank="auto",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.layer_norm_epsilon = layer_norm_epsilon
        self.d_model = d_model
        self.d_inner = d_model * 2
        self.d_conv = 4
        self.d_state = d_state
        self.expand = expand
        self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class MambaBlock(nn.Module):
    def __init__(self, prefix, config, weights, layer_id):
        super().__init__()
        self.layer_id = layer_id
        self.in_proj = FastLinear.load(config, f"{prefix}.in_proj", weights, bias=False)
        self.x_proj = FastLinear.load(config, f"{prefix}.x_proj", weights, bias=False)
        self.dt_proj = FastLinear.load(config, f"{prefix}.dt_proj", weights, bias=True)
        self.dt_proj_no_bias = FastLinear.load(
            config, f"{prefix}.dt_proj", weights, bias=False
        )
        self.out_proj = FastLinear.load(
            config, f"{prefix}.out_proj", weights, bias=False
        )
        self.conv1d = FastLinear.load(config, f"{prefix}.conv1d", weights, bias=True)
        self.negA = -torch.exp(weights.get_tensor(f"{prefix}.A_log").float())
        self.D = weights.get_tensor(f"{prefix}.D")
        self.activation = "silu"
        self.dt_rank = config.dt_rank
        self.d_state = config.d_state
        self.d_conv = config.d_conv
        self.act = nn.SiLU()

    # inference_params
    def forward(self, hidden_states: torch.Tensor, inference_params=None):
        if inference_params.seqlen_offset > 0:
            conv_state = inference_params.conv_states[self.layer_id]
            ssm_state = inference_params.ssm_states[self.layer_id]
            out, conv_state, ssm_state = self.step(hidden_states, conv_state, ssm_state)
            return out, conv_state, ssm_state

        _, seqlen, _ = hidden_states.shape
        projected_states = self.in_proj(hidden_states).transpose(1, 2)
        # assert projected_states.shape == [batch_size, 2 * dstate, seqlen], f"{projected_states.shape} [{batch_size}, {dstate}, {seqlen}]"
        x, z = projected_states.chunk(2, dim=1)
        conv_state = F.pad(x, (self.d_conv - seqlen, 0))
        x = causal_conv1d_fn(
            x=x,
            weight=self.conv1d.weight.squeeze(1),
            bias=self.conv1d.bias,
            activation=self.activation,
        )

        # We're careful here about the layout, to avoid extra transposes.
        # We want dt to have d as the slowest moving dimension
        # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
        x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d"))  # (bl d)
        dt, B, C = torch.split(
            x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1
        )
        dt = self.dt_proj.weight @ dt.t()
        dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
        B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
        C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
        y, last_state = selective_scan_fn(
            x,
            dt,
            self.negA,
            B,
            C,
            self.D.float(),
            z=z,
            delta_bias=self.dt_proj.bias.float(),
            delta_softplus=True,
            return_last_state=True,
        )
        y = rearrange(y, "b d l -> b l d")
        attn_outputs = self.out_proj(y)
        return attn_outputs, conv_state, last_state

    def step(self, hidden_states, conv_state, ssm_state):
        xz = self.in_proj(hidden_states.squeeze(1))
        x, z = xz.chunk(2, dim=-1)  # (B D)
        x = causal_conv1d_update(
            x,
            conv_state,
            self.conv1d.weight.squeeze(1),
            self.conv1d.bias,
            self.activation,
        )
        x_db = self.x_proj(x)  # (B dt_rank+2*d_state)
        dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
        dt = F.linear(dt, self.dt_proj.weight)
        A = self.negA
        y = selective_state_update(
            ssm_state,
            x,
            dt,
            A,
            B,
            C,
            self.D,
            z=z,
            dt_bias=self.dt_proj.bias,
            dt_softplus=True,
        )
        out = self.out_proj(y)
        return out.unsqueeze(1), conv_state.clone(), ssm_state.clone()


class ResidualBlock(nn.Module):
    def __init__(self, prefix, config, weights, layer_id):
        super().__init__()
        self.mamba_block = MambaBlock(
            prefix=f"{prefix}.mixer", config=config, weights=weights, layer_id=layer_id
        )
        self.layer_norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.layer_norm_epsilon
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
        inference_params: Optional[Any] = None,
    ):
        residual = (hidden_states + residual) if residual is not None else hidden_states
        shape = residual.shape
        hidden_states, _ = self.layer_norm(residual.view(-1, shape[-1]))
        hidden_states, conv_state, last_ssm_state = self.mamba_block(
            hidden_states.view(*shape), inference_params
        )
        return hidden_states, residual, conv_state, last_ssm_state


class MambaModel(nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        prefix = "backbone"
        try:
            self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embeddings", weights)
        except RuntimeError:
            self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embedding", weights)
        self.blocks = nn.ModuleList(
            [
                ResidualBlock(f"{prefix}.layers.{i}", config, weights, layer_id=i)
                for i in range(config.n_layer)
            ]
        )
        self.norm_f = FastRMSNorm.load(
            f"{prefix}.norm_f", weights, eps=config.layer_norm_epsilon
        )
        try:
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
        except RuntimeError:
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights)
        self.config = config

    def forward(
        self, input_ids: torch.Tensor, inference_params=None, residual=None
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.embed_tokens(input_ids)
        for i, block in enumerate(self.blocks):
            hidden_states, residual, conv_state, ssm_state = block(
                hidden_states, residual, inference_params
            )
            inference_params.conv_states[i].copy_(conv_state)
            inference_params.ssm_states[i].copy_(ssm_state)

        hidden_states = (
            hidden_states + residual if residual is not None else hidden_states
        )
        hidden_states, _ = self.norm_f(hidden_states.view(-1, hidden_states.size(-1)))
        hidden_states = hidden_states.view(residual.shape)
        logits, speculative_logits = self.lm_head(hidden_states)

        # update the offset for the next inference using these params
        inference_params.seqlen_offset += input_ids.size(1)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
================================================
# coding=utf-8
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Qwen2.5 VL model."""

from typing import Optional, Tuple, List

import torch
import torch.utils.checkpoint
from torch import nn

from habana_frameworks.torch.hpex.kernels import FusedSDPA
from vllm_hpu_extension.utils import ModuleFusedSDPA


import numpy as np

from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig

import torch.nn.functional as F

from text_generation_server.layers.layernorm import FastRMSNorm
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
)
from text_generation_server.layers.attention import (
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
    Qwen2Model,
)
from habana_frameworks.torch.hpex.kernels import (
    RotaryPosEmbeddingMode,
    apply_rotary_pos_emb,
)
import habana_frameworks.torch as htorch

# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
from typing import Union
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.video_utils import VideoInput
from transformers.processing_utils import (
    ProcessingKwargs,
    ProcessorMixin,
    Unpack,
    VideosKwargs,
)
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput


class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
    fps: Union[List[float], float]


class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
    videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "videos_kwargs": {"fps": 2.0},
    }


class Qwen2_5_VLProcessor(ProcessorMixin):
    r"""
    Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
        self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
    ):
        self.image_token = (
            "<|image_pad|>"
            if not hasattr(tokenizer, "image_token")
            else tokenizer.image_token
        )
        self.video_token = (
            "<|video_pad|>"
            if not hasattr(tokenizer, "video_token")
            else tokenizer.video_token
        )
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[
            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
        ] = None,
        videos: VideoInput = None,
        **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Qwen2_5_VLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if images is not None:
            image_inputs = self.image_processor(
                images=images, videos=None, **output_kwargs["images_kwargs"]
            )
            image_grid_thw = image_inputs["image_grid_thw"]
        else:
            image_inputs = {}
            image_grid_thw = None

        if videos is not None:
            videos_inputs = self.image_processor(
                images=None, videos=videos, **output_kwargs["images_kwargs"]
            )
            video_grid_thw = videos_inputs["video_grid_thw"]

            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
            if isinstance(fps, (int, float)):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / fps
                ] * len(video_grid_thw)
            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / tmp for tmp in fps
                ]
            else:
                raise ValueError(
                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
                )
            videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})

        else:
            videos_inputs = {}
            video_grid_thw = None

        if not isinstance(text, list):
            text = [text]

        if image_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    text[i] = text[i].replace(
                        self.image_token,
                        "<|placeholder|>"
                        * (image_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        if video_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
                    text[i] = text[i].replace(
                        self.video_token,
                        "<|placeholder|>"
                        * (video_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.video_token)

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(self, generated_outputs):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.

        Returns:
            `List[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(
            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
        )
        return names_from_processor + ["second_per_grid_ts"]


# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
class Qwen2_5_VLVisionConfig(PretrainedConfig):
    model_type = "qwen2_5_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        spatial_patch_size=14,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=[7, 15, 23, 31],
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_patch_size = spatial_patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.fullatt_block_indexes = fullatt_block_indexes
        self.out_hidden_size = out_hidden_size


class Qwen2_5_VLConfig(PretrainedConfig):

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        if vision_config is not None:
            self.vision_config = Qwen2_5_VLVisionConfig(**vision_config)

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        # and change type from 'mrope' to 'default' because `mrope` does defeault RoPE calculations
        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
        # TODO: @raushan update config in the hub
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


class Qwen2_5VLAttention(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size // weights.process_group.size()
        self.head_dim = config.hidden_size // config.num_heads
        self.num_heads = config.num_heads // weights.process_group.size()

        self.qkv = TensorParallelColumnLinear.load_qkv(
            config,
            prefix=f"{prefix}.qkv",
            weights=weights,
            bias=False,
            num_heads=self.num_heads,
            num_key_value_heads=self.num_heads,
        )
        self.qkv.linear.bias = weights.get_sharded(f"{prefix}.qkv.bias", dim=0)

        self.proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.proj",
            weights=weights,
            bias=True,
        )
        self.softmax_scale = 1.0 / np.sqrt(self.embed_dim // self.num_heads)

    def forward(
        self,
        hidden_state: torch.Tensor,
        cu_seqlens: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        max_seqlen: int,
    ) -> torch.Tensor:
        # apply the qkv linear layer to the hidden state
        qkv = self.qkv(hidden_state)
        query, key, value = qkv.split(
            [self.embed_dim, self.embed_dim, self.embed_dim], dim=1
        )

        # reshape the query, key, and value tensors
        _shape = (
            hidden_state.shape[0],
            self.num_heads,
            self.embed_dim // self.num_heads,
        )
        query = query.view(*_shape)
        key = key.view(*_shape)
        value = value.view(*_shape)
        # apply rotary positional embeddings
        rope_mode = RotaryPosEmbeddingMode.BLOCKWISE
        rotary_dim = cos.shape[-1]
        query_rot = query[..., :rotary_dim]
        query_pass = query[..., rotary_dim:]
        query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, rope_mode)
        query.copy_(torch.cat((query_rot, query_pass), dim=-1).reshape(query.shape))

        key_rot = key[..., :rotary_dim]
        key_pass = key[..., rotary_dim:]
        key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode)
        key.copy_(torch.cat((key_rot, key_pass), dim=-1).reshape(key.shape))

        # execute sdpa
        causal = False
        query = query.transpose(0, 1)
        key = key.transpose(0, 1)
        value = value.transpose(0, 1)
        fsdpa_op = ModuleFusedSDPA(FusedSDPA)
        attention_mask = torch.zeros(
            [1, max_seqlen, max_seqlen], device=query.device, dtype=torch.bool
        )
        for i in range(1, len(cu_seqlens)):
            attention_mask[
                :, cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]
            ] = True
        attn_output = fsdpa_op(
            query,
            key,
            value,
            attn_mask=attention_mask,
            dropout_p=0.0,
            is_causal=causal,
            scale=None,
            softmax_mode="None",
            recompute_mode=None,
            valid_sequence_lengths=None,
        )
        attn_output = attn_output.transpose(0, 1)

        # reshape output to original dimensions
        attn_output = attn_output.reshape(hidden_state.shape[0], -1)
        attn_output = self.proj(attn_output)
        return attn_output


class Qwen2_5VLVisionMLP(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.activation_fn = ACT2FN[config.hidden_act]

        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

        self.up = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.up_proj", weights=weights, config=config, bias=True
        )
        self.gate = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.gate_proj", weights=weights, config=config, bias=True
        )
        self.down = TensorParallelRowLinear.load(
            prefix=f"{prefix}.down_proj", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        gate_states = self.gate(hidden_states)
        up_states = self.up(hidden_states)
        activated_states = self.activation_fn(gate_states) * up_states
        down_states = self.down(activated_states)
        return down_states


class Qwen2_5VLVisionBlock(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.attn = Qwen2_5VLAttention(
            prefix=f"{prefix}.attn",
            config=config,
            weights=weights,
        )
        self.norm1 = FastRMSNorm.load(
            prefix=f"{prefix}.norm1",
            weights=weights,
            eps=1e-6,
        )
        self.norm2 = FastRMSNorm.load(
            prefix=f"{prefix}.norm2",
            weights=weights,
            eps=1e-6,
        )
        self.mlp = Qwen2_5VLVisionMLP(
            prefix=f"{prefix}.mlp",
            config=config,
            weights=weights,
        )

    def forward(self, hidden_states, cu_seqlens, cos, sin, max_seqlen) -> torch.Tensor:
        norm1_out, _ = self.norm1(hidden_states)
        attn_out = self.attn(norm1_out, cu_seqlens, cos, sin, max_seqlen)
        hidden_states = hidden_states + attn_out
        norm2_out, _ = self.norm2(hidden_states)
        mlp_out = self.mlp(norm2_out)
        hidden_states = hidden_states + mlp_out
        return hidden_states


class Qwen2_5VLPatchMerger(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.hidden_size = config.hidden_size * (config.spatial_merge_size**2)
        self.patch_merger_ln_q = FastRMSNorm.load(
            prefix=f"{prefix}.ln_q",
            weights=weights,
            eps=1e-6,
        )
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.mlp.0", weights=weights, config=config, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.mlp.2", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states) -> torch.Tensor:
        hidden_states, _ = self.patch_merger_ln_q(hidden_states)
        hidden_states = hidden_states.view(-1, self.hidden_size)
        hidden_states = self.fc1(hidden_states)
        hidden_states = F.gelu(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Qwen2_5VisionModel(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()

        self.spatial_merge_size = config.spatial_merge_size
        kernel_size = [config.temporal_patch_size, config.patch_size, config.patch_size]
        self.patch_embedding = nn.Conv3d(
            in_channels=config.in_channels,
            out_channels=config.hidden_size,
            kernel_size=kernel_size,
            stride=kernel_size,
            bias=False,
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embed.proj.weight"), requires_grad=False
        )
        head_dim = config.hidden_size // config.num_heads

        theta = 10000.0
        dim = head_dim // 2
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        self.blocks = nn.ModuleList(
            [
                Qwen2_5VLVisionBlock(
                    prefix=f"{prefix}.blocks.{i}",
                    config=config,
                    weights=weights,
                )
                for i in range(config.depth)
            ]
        )
        self.merger = Qwen2_5VLPatchMerger(
            prefix=f"{prefix}.merger",
            config=config,
            weights=weights,
        )

        self.temporal_patch_size = config.temporal_patch_size
        self.spatial_patch_size = config.spatial_patch_size
        self.in_channels = config.in_channels
        self.embed_dim = config.hidden_size
        self.window_size = config.window_size
        self.patch_size = config.patch_size
        self.spatial_merge_unit = config.spatial_merge_size * config.spatial_merge_size
        self.fullatt_block_indexes = config.fullatt_block_indexes

    def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor:
        batch_size, _, hidden_size = hidden_state.shape
        class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size)
        hidden_state = torch.cat([class_embedding, hidden_state], dim=1)
        return hidden_state

    def get_window_index(self, grid_thw):
        window_index: list = []
        cu_window_seqlens: list = [0]
        window_index_id = 0
        vit_merger_window_size = (
            self.window_size // self.spatial_merge_size // self.patch_size
        )

        for grid_t, grid_h, grid_w in grid_thw:
            llm_grid_h, llm_grid_w = (
                grid_h // self.spatial_merge_size,
                grid_w // self.spatial_merge_size,
            )
            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
                grid_t, llm_grid_h, llm_grid_w
            )
            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
            index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
            index_padded = index_padded.reshape(
                grid_t,
                num_windows_h,
                vit_merger_window_size,
                num_windows_w,
                vit_merger_window_size,
            )
            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
                grid_t,
                num_windows_h * num_windows_w,
                vit_merger_window_size,
                vit_merger_window_size,
            )
            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
            index_padded = index_padded.reshape(-1)
            index_new = index_padded[index_padded != -100]
            window_index.append(index_new + window_index_id)
            cu_seqlens_tmp = (
                seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
            )
            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
        window_index = torch.cat(window_index, dim=0)

        return window_index, cu_window_seqlens

    def forward(
        self,
        pixel_values: torch.Tensor,
        grid_thw: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:

        # reshape the input tensor for processing
        shape = (
            -1,
            self.in_channels,
            self.temporal_patch_size,
            self.spatial_patch_size,
            self.spatial_patch_size,
        )
        pixel_values = pixel_values.view(shape).to(self.patch_embedding.weight.dtype)
        hidden_states = self.patch_embedding(pixel_values).view(-1, self.embed_dim)
        # TODO: revisit to see if we can avoid some of these reshapes

        # find the position ids for the input tensor based on the grid_thw
        pos_ids = []
        for t, h, w in grid_thw:
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
            hpos_ids = hpos_ids.flatten()

            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
            wpos_ids = wpos_ids.flatten()
            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))

        pos_ids = torch.cat(pos_ids, dim=0)

        max_grid_size = grid_thw[:, 1:].max()

        # apply the positional embeddings to the position ids
        seq = torch.arange(
            max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype
        )
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
        seq_len = hidden_states.shape[0]
        patch_shape = (seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
        og_shape = (seq_len, -1)

        hidden_states = hidden_states.view(patch_shape)[window_index, :, :].view(
            og_shape
        )
        rotary_pos_emb = rotary_pos_emb.view(patch_shape)[window_index, :, :].view(
            og_shape
        )

        rotary_pos_emb = rotary_pos_emb.to(device=hidden_states.device)
        cos = rotary_pos_emb.cos()
        sin = rotary_pos_emb.sin()
        cos = torch.cat((cos, cos), dim=-1).unsqueeze(1)
        sin = torch.cat((sin, sin), dim=-1).unsqueeze(1)

        cu_window_seqlens = torch.tensor(
            cu_window_seqlens,
            device="cpu",
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens).to(
            hidden_states.device
        )

        # create a cu_seqlens tensor to be used in the attention mask
        cu_seqlens = torch.repeat_interleave(
            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
        ).cumsum(dim=0, dtype=torch.int32)
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
        max_seqlen = torch.max(cu_seqlens[1:] - cu_seqlens[:-1])

        # iterately apply the blocks to the hidden states
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for layer_num, block in enumerate(self.blocks):
            # NOTE: qwen2_5_vl.py has a concept of full attention blocks
            # that are applied at specific layers.
            if layer_num in self.fullatt_block_indexes:
                cu_seqlens_now = cu_seqlens
            else:
                cu_seqlens_now = cu_window_seqlens

            hidden_states = block(hidden_states, cu_seqlens_now, cos, sin, max_seqlen)
            if lazy_mode:
                htorch.core.mark_step()

        # apply the final patch merger to the hidden states
        hidden_states = self.merger(hidden_states)
        reverse_indices = torch.argsort(window_index)
        hidden_states = hidden_states[reverse_indices, :]
        return hidden_states


class Qwen2_5VLForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly
        # returns rope_scaling.type == "default" for Qwen2_5-VL model at the moment
        if (
            hasattr(config, "rope_scaling")
            and config.rope_scaling is not None
            and config.rope_scaling.get("type", None) == "default"
        ):
            config.rope_scaling.update({"rope_type": "mrope"})
        self.hidden_size = config.hidden_size
        self.vision_start_token_id = config.vision_start_token_id
        self.vision_end_token_id = config.vision_end_token_id
        self.image_token_id = config.image_token_id
        self.video_token_id = config.video_token_id
        self.spatial_merge_size = config.vision_config.spatial_merge_size
        self.embed_tokens = TensorParallelEmbedding(
            prefix="model.embed_tokens", weights=weights
        )
        self.visual = Qwen2_5VisionModel(
            prefix="visual", config=config.vision_config, weights=weights
        )
        self.text_model = Qwen2Model(prefix=None, config=config, weights=weights)
        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        self.lm_head = SpeculativeHead.load(
            config,
            prefix=suffix if not prefix else f"{prefix}.{suffix}",
            weights=weights,
        )
        self.device = weights.device

    # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391
    # modified to first find segments then initialize position ids for each segment
    # Steps:
    #  locate all vision and text segments
    #  calculate `vision_segment_lengths` for each vision segment to be use as offset
    #  calculate `text_segment_lengths` for each text segment to be used as offset
    #  create position ids for each vision segment based on the image grid
    #  create position ids for each text segment
    #  combine all the position ids
    #  the final segment is the difference between the last vision segment and the end of the input
    #  combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3)
    def get_position_ids(
        self,
        input_ids: torch.Tensor,
        image_grid_thw: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if image_grid_thw is None:
            return (
                torch.arange(input_ids.shape[0], device=input_ids.device)
                .unsqueeze(1)
                .repeat(1, 3)
            )

        spatial_merge_size = self.spatial_merge_size
        vision_start_token_id = self.vision_start_token_id
        vision_end_token_id = self.vision_end_token_id
        device = input_ids.device
        dtype = input_ids.dtype
        input_ids_len = input_ids.shape[0]

        vision_starts = torch.where(input_ids == vision_start_token_id)[0]
        vision_ends = torch.where(input_ids == vision_end_token_id)[0]
        vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
        prev_vision_end = torch.cat(
            [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
        )
        text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
        vision_widths_max = torch.cat(
            [
                torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
                image_grid_thw[:-1, 2] // spatial_merge_size,
            ]
        )
        vision_segment_lengths = vision_widths_max + text_lengths_between_vision
        vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
        text_segment_lengths = vision_segment_lengths - text_lengths_between_vision

        # create position ids for each vision segment based on the image grid
        llm_pos_ids_list = []
        for i, _ in enumerate(vision_segments):
            t, h, w = (
                image_grid_thw[i][0],
                image_grid_thw[i][1] // spatial_merge_size,
                image_grid_thw[i][2] // spatial_merge_size,
            )
            t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
            h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
            w_indices = torch.arange(w, device=device).repeat(t * h)
            image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)

            # offset by the position of the last vision segment
            im = image_position_ids + vision_segment_lengths[i]
            llm_pos_ids_list.append(im)

        # create position ids for each text segment
        text_ranges = [
            torch.arange(seq_len, device=device).view(1, -1).expand(3, -1)
            + text_segment_lengths[i]
            for i, seq_len in enumerate(text_lengths_between_vision)
        ]

        full_llm_pos_ids_list = [
            item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
        ]
        max_s = full_llm_pos_ids_list[-1].max() + 1
        final_text_len = input_ids_len - vision_ends[-1]
        if final_text_len > 0:
            m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
            full_llm_pos_ids_list.append(m + max_s)

        position_ids = (
            torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
        )
        return position_ids

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).squeeze(0)
        return image_embeds

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.embed_tokens(input_ids)

        # apply the visual model to the pixel values if they are provided
        if vision_embeds is not None:
            mask = torch.where(input_ids == self.image_token_id)
            inputs_embeds[mask] = vision_embeds

        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor],
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
        image_indices=None,
    ):

        hidden_states = self.text_model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            slots=slots,
            seqlen=seqlen,
            hpu_attention_meta=hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_vl.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Qwen2 VL model."""

from typing import Optional, Tuple, List

import torch
import torch.utils.checkpoint
from torch import nn


from habana_frameworks.torch.hpex.kernels import FusedSDPA
from vllm_hpu_extension.utils import ModuleFusedSDPA


import numpy as np

from transformers.activations import ACT2FN
import torch.nn.functional as F

from text_generation_server.layers.layernorm import FastLayerNorm, FastRMSNorm
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
)
from text_generation_server.layers.attention import (
    Seqlen,
    HPUPagedAttentionMetadata,
)
from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
    Qwen2Model,
)
from habana_frameworks.torch.hpex.kernels import (
    RotaryPosEmbeddingMode,
    apply_rotary_pos_emb,
)
import habana_frameworks.torch as htorch


class Qwen2VLAttention(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.embed_dim // weights.process_group.size()
        self.head_dim = config.hidden_size // config.num_heads
        self.num_heads = config.num_heads // weights.process_group.size()

        self.qkv = TensorParallelColumnLinear.load_qkv(
            config,
            prefix=f"{prefix}.qkv",
            weights=weights,
            bias=False,
            num_heads=self.num_heads,
            num_key_value_heads=self.num_heads,
        )
        self.qkv.linear.bias = weights.get_sharded(f"{prefix}.qkv.bias", dim=0)
        self.proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.proj",
            weights=weights,
            bias=True,
        )
        self.softmax_scale = 1.0 / np.sqrt(self.embed_dim // self.num_heads)

    def forward(
        self,
        hidden_state: torch.Tensor,
        cu_seqlens: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        max_seqlen: int,
    ) -> torch.Tensor:
        # apply the qkv linear layer to the hidden state
        qkv = self.qkv(hidden_state)
        query, key, value = qkv.split(
            [self.embed_dim, self.embed_dim, self.embed_dim], dim=1
        )

        # reshape the query, key, and value tensors
        _shape = (
            hidden_state.shape[0],
            self.num_heads,
            self.embed_dim // self.num_heads,
        )
        query = query.view(*_shape)
        key = key.view(*_shape)
        value = value.view(*_shape)

        # apply rotary positional embeddings
        rope_mode = RotaryPosEmbeddingMode.BLOCKWISE
        rotary_dim = cos.shape[-1]
        query_rot = query[..., :rotary_dim]
        query_pass = query[..., rotary_dim:]
        query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, rope_mode)
        query.copy_(torch.cat((query_rot, query_pass), dim=-1).reshape(query.shape))

        key_rot = key[..., :rotary_dim]
        key_pass = key[..., rotary_dim:]
        key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode)
        key.copy_(torch.cat((key_rot, key_pass), dim=-1).reshape(key.shape))

        # execute sdpa
        causal = False
        query = query.transpose(0, 1)
        key = key.transpose(0, 1)
        value = value.transpose(0, 1)
        fsdpa_op = ModuleFusedSDPA(FusedSDPA)
        attention_mask = torch.zeros(
            [1, max_seqlen, max_seqlen], device=query.device, dtype=torch.bool
        )
        for i in range(1, len(cu_seqlens)):
            attention_mask[
                :, cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]
            ] = True
        attn_output = fsdpa_op(
            query,
            key,
            value,
            attn_mask=attention_mask,
            dropout_p=0.0,
            is_causal=causal,
            scale=None,
            softmax_mode="None",
            recompute_mode=None,
            valid_sequence_lengths=None,
        )
        attn_output = attn_output.transpose(0, 1)
        # reshape output to original dimensions
        attn_output = attn_output.reshape(hidden_state.shape[0], -1)
        attn_output = self.proj(attn_output)
        return attn_output


class Qwen2VLVisionMLP(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Qwen2VLVisionBlock(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.attn = Qwen2VLAttention(
            prefix=f"{prefix}.attn",
            config=config,
            weights=weights,
        )
        self.norm1 = FastLayerNorm.load(
            prefix=f"{prefix}.norm1",
            weights=weights,
            eps=1e-6,
        )
        self.norm2 = FastLayerNorm.load(
            prefix=f"{prefix}.norm2",
            weights=weights,
            eps=1e-6,
        )
        self.mlp = Qwen2VLVisionMLP(
            prefix=f"{prefix}.mlp",
            config=config,
            weights=weights,
        )

    def forward(self, hidden_states, cu_seqlens, cos, sin, max_seqlen) -> torch.Tensor:
        norm1_out, residual = self.norm1(hidden_states)
        attn_out = self.attn(norm1_out, cu_seqlens, cos, sin, max_seqlen)
        hidden_states = attn_out + residual
        norm2_out, residual = self.norm2(hidden_states)
        hidden_states = hidden_states + self.mlp(norm2_out)
        return hidden_states


class Qwen2VLPatchMerger(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.hidden_size = config.embed_dim * (config.spatial_merge_size**2)
        self.patch_merger_ln_q = FastLayerNorm.load(
            prefix=f"{prefix}.ln_q",
            weights=weights,
            eps=1e-6,
        )
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.mlp.0", weights=weights, config=config, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.mlp.2", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states) -> torch.Tensor:
        hidden_states, _ = self.patch_merger_ln_q(hidden_states)
        hidden_states = hidden_states.view(-1, self.hidden_size)
        hidden_states = self.fc1(hidden_states)
        hidden_states = F.gelu(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Qwen2VisionModel(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.spatial_merge_size = config.spatial_merge_size
        kernel_size = [config.temporal_patch_size, config.patch_size, config.patch_size]
        self.patch_embedding = nn.Conv3d(
            in_channels=config.in_chans,
            out_channels=config.embed_dim,
            kernel_size=kernel_size,
            stride=kernel_size,
            bias=False,
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embed.proj.weight"), requires_grad=False
        )
        head_dim = config.embed_dim // config.num_heads
        # TODO: replace with static positional embeddings once implemented
        theta = 10000.0
        dim = head_dim // 2
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        self.blocks = nn.ModuleList(
            [
                Qwen2VLVisionBlock(
                    prefix=f"{prefix}.blocks.{i}",
                    config=config,
                    weights=weights,
                )
                for i in range(config.depth)
            ]
        )
        self.merger = Qwen2VLPatchMerger(
            prefix=f"{prefix}.merger",
            config=config,
            weights=weights,
        )

        self.temporal_patch_size = config.temporal_patch_size
        self.spatial_patch_size = config.spatial_patch_size
        self.in_channels = config.in_channels
        self.embed_dim = config.embed_dim

    def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor:
        batch_size, _, hidden_size = hidden_state.shape
        class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size)
        hidden_state = torch.cat([class_embedding, hidden_state], dim=1)
        return hidden_state

    def forward(
        self,
        pixel_values: torch.Tensor,
        grid_thw: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        # reshape the input tensor for processing
        shape = (
            -1,
            self.in_channels,
            self.temporal_patch_size,
            self.spatial_patch_size,
            self.spatial_patch_size,
        )
        pixel_values = pixel_values.view(shape).to(self.patch_embedding.weight.dtype)
        hidden_states = self.patch_embedding(pixel_values).view(-1, self.embed_dim)
        # TODO: revisit to see if we can avoid some of these reshapes

        # find the position ids for the input tensor based on the grid_thw
        pos_ids = []
        for t, h, w in grid_thw:
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
            hpos_ids = hpos_ids.flatten()

            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
            wpos_ids = wpos_ids.flatten()
            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))

        pos_ids = torch.cat(pos_ids, dim=0)
        max_grid_size = grid_thw[:, 1:].max()

        # apply the positional embeddings to the position ids
        seq = torch.arange(
            max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype
        )
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, hidden_states.dtype)

        cos = rotary_pos_emb.cos()
        sin = rotary_pos_emb.sin()
        cos = torch.cat((cos, cos), dim=-1).unsqueeze(1)
        sin = torch.cat((sin, sin), dim=-1).unsqueeze(1)

        # create a cu_seqlens tensor to be used in the attention mask
        cu_seqlens = torch.repeat_interleave(
            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
        ).cumsum(dim=0, dtype=torch.int32)
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
        max_seqlen = torch.max(cu_seqlens[1:] - cu_seqlens[:-1])
        # iterately apply the blocks to the hidden states
        lazy_mode = htorch.utils.internal.is_lazy()
        if lazy_mode:
            htorch.core.mark_step()
        for block in self.blocks:
            hidden_states = block(hidden_states, cu_seqlens, cos, sin, max_seqlen)
            if lazy_mode:
                htorch.core.mark_step()

        # apply the final patch merger to the hidden states
        hidden_states = self.merger(hidden_states)
        return hidden_states


class Qwen2VLForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly
        # returns rope_scaling.type == "default" for Qwen2-VL model at the moment
        if (
            hasattr(config, "rope_scaling")
            and config.rope_scaling is not None
            and config.rope_scaling.get("type", None) == "default"
        ):
            config.rope_scaling.update({"rope_type": "mrope"})
        self.hidden_size = config.hidden_size
        self.vision_start_token_id = config.vision_start_token_id
        self.vision_end_token_id = config.vision_end_token_id
        self.image_token_id = config.image_token_id
        self.video_token_id = config.video_token_id
        self.spatial_merge_size = config.vision_config.spatial_merge_size
        self.embed_tokens = TensorParallelEmbedding(
            prefix="model.embed_tokens", weights=weights
        )
        self.visual = Qwen2VisionModel(
            prefix="visual", config=config.vision_config, weights=weights
        )
        self.text_model = Qwen2Model(prefix=None, config=config, weights=weights)
        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        self.lm_head = SpeculativeHead.load(
            config,
            prefix=suffix if not prefix else f"{prefix}.{suffix}",
            weights=weights,
        )
        self.norm = FastRMSNorm.load(
            prefix="model.norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.device = weights.device

    # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391
    # modified to first find segments then initialize position ids for each segment
    # Steps:
    #  locate all vision and text segments
    #  calculate `vision_segment_lengths` for each vision segment to be use as offset
    #  calculate `text_segment_lengths` for each text segment to be used as offset
    #  create position ids for each vision segment based on the image grid
    #  create position ids for each text segment
    #  combine all the position ids
    #  the final segment is the difference between the last vision segment and the end of the input
    #  combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3)
    def get_position_ids(
        self,
        input_ids: torch.Tensor,
        image_grid_thw: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if image_grid_thw is None:
            return (
                torch.arange(input_ids.shape[0], device=input_ids.device)
                .unsqueeze(1)
                .repeat(1, 3)
            )

        spatial_merge_size = self.spatial_merge_size
        vision_start_token_id = self.vision_start_token_id
        vision_end_token_id = self.vision_end_token_id
        device = input_ids.device
        dtype = input_ids.dtype
        input_ids_len = input_ids.shape[0]

        vision_starts = torch.where(input_ids == vision_start_token_id)[0]
        vision_ends = torch.where(input_ids == vision_end_token_id)[0]
        vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
        prev_vision_end = torch.cat(
            [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
        )
        text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
        vision_widths_max = torch.cat(
            [
                torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
                image_grid_thw[:-1, 2] // spatial_merge_size,
            ]
        )
        vision_segment_lengths = vision_widths_max + text_lengths_between_vision
        vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
        text_segment_lengths = vision_segment_lengths - text_lengths_between_vision

        # create position ids for each vision segment based on the image grid
        llm_pos_ids_list = []
        for i, _ in enumerate(vision_segments):
            t, h, w = (
                image_grid_thw[i][0],
                image_grid_thw[i][1] // spatial_merge_size,
                image_grid_thw[i][2] // spatial_merge_size,
            )
            t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
            h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
            w_indices = torch.arange(w, device=device).repeat(t * h)
            image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)

            # offset by the position of the last vision segment
            im = image_position_ids + vision_segment_lengths[i]
            llm_pos_ids_list.append(im)

        # create position ids for each text segment
        text_ranges = [
            torch.arange(seq_len, device=device).view(1, -1).expand(3, -1)
            + text_segment_lengths[i]
            for i, seq_len in enumerate(text_lengths_between_vision)
        ]

        full_llm_pos_ids_list = [
            item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
        ]
        max_s = full_llm_pos_ids_list[-1].max() + 1
        final_text_len = input_ids_len - vision_ends[-1]
        if final_text_len > 0:
            m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
            full_llm_pos_ids_list.append(m + max_s)

        position_ids = (
            torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
        )
        return position_ids

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).squeeze(0)
        return image_embeds

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.embed_tokens(input_ids)

        # apply the visual model to the pixel values if they are provided
        if vision_embeds is not None:
            mask = torch.where(input_ids == self.image_token_id)
            inputs_embeds[mask] = vision_embeds

        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        slots: torch.Tensor,
        seqlen: Seqlen,
        hpu_attention_meta: Optional[HPUPagedAttentionMetadata],
        lm_head_indices: Optional[torch.Tensor],
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
        image_indices=None,
    ):
        hidden_states = self.text_model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            slots=slots,
            seqlen=seqlen,
            hpu_attention_meta=hpu_attention_meta,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/siglip.py
================================================
from typing import Optional, Tuple
import warnings
import math
import torch
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutputWithPooling,
)
from transformers import SiglipConfig, SiglipVisionConfig
from torch.nn.init import _calculate_fan_in_and_fan_out

from text_generation_server.layers.tensor_parallel import (
    TensorParallelEmbedding,
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)


class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, prefix, config: SiglipVisionConfig, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )
        self.patch_embedding.bias = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False
        )
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches
        self.position_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.position_embedding", weights=weights
        )
        self.register_buffer(
            "position_ids",
            torch.arange(self.num_positions, device=weights.device).expand((1, -1)),
            persistent=False,
        )

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        patch_embeds = self.patch_embedding(
            pixel_values
        )  # shape = [*, width, grid, grid]
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.head_size = self.head_dim
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.embed_dim = self.embed_dim // weights.process_group.size()
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.k_proj", weights=weights, bias=True
        )
        self.v_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.v_proj", weights=weights, bias=True
        )
        self.q_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.q_proj", weights=weights, bias=True
        )
        self.out_proj = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, _ = hidden_states.size()
        query_states = self.q_proj(hidden_states)
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        # scale post matmul
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) * self.scale

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = (
                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
                + attention_mask
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(attn_weights.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class SiglipMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(  # config.hidden_size, config.intermediate_size
            prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(  # config.intermediate_size, config.hidden_size
            prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class SiglipEncoderLayer(nn.Module):
    def __init__(self, prefix, config: SiglipConfig, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = SiglipAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.layer_norm1 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps
        )
        self.mlp = SiglipMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.layer_norm2 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> Tuple[torch.FloatTensor]:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
        )
        hidden_states = residual + hidden_states
        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states, None


class SiglipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling."""

    def __init__(self, prefix, config: SiglipVisionConfig, weights):
        super().__init__()

        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(
            config.hidden_size, config.num_attention_heads, batch_first=True
        )
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(prefix, config, weights)

    def forward(self, hidden_state):
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


def _trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    lower = norm_cdf((a - mean) / std)
    upper = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    tensor.uniform_(2 * lower - 1, 2 * upper - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal
    tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    tensor.clamp_(min=a, max=b)


def trunc_normal_tf_(
    tensor: torch.Tensor,
    mean: float = 0.0,
    std: float = 1.0,
    a: float = -2.0,
    b: float = 2.0,
) -> torch.Tensor:
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \\leq \text{mean} \\leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsquently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")


class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    """

    def __init__(self, prefix, config: SiglipConfig, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                SiglipEncoderLayer(
                    prefix=f"{prefix}.layers.{i}", config=config, weights=weights
                )
                for i in range(config.num_hidden_layers)
            ]
        )

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            hidden_states, _ = encoder_layer(
                hidden_states,
                attention_mask,
            )

        return hidden_states


class SiglipVisionTransformer(nn.Module):
    def __init__(self, prefix, config: SiglipVisionConfig, weights):
        super().__init__()
        self.config = config

        self.embeddings = SiglipVisionEmbeddings(
            prefix=f"{prefix}.embeddings", config=config, weights=weights
        )
        self.encoder = SiglipEncoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
    ):
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)

        # NOTE: up until this point, the code logits are exactly
        # the same as the transformers code. The values evaulate
        # slightly differently in our encoder layer.
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
        )
        last_hidden_state = encoder_outputs

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            # pooler_output=pooled_output,
            # hidden_states=encoder_outputs,
        )


================================================
FILE: backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py
================================================
def load_text_model(prefix, config, weights, name=None):
    if config.model_type == "llama":
        from text_generation_server.models.custom_modeling.flash_llama_modeling import (
            FlashLlamaForCausalLM,
        )

        return FlashLlamaForCausalLM(prefix, config, weights, name=name)
    elif config.model_type == "mistral":
        from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
            FlashMistralForCausalLM,
        )

        return FlashMistralForCausalLM(prefix, config, weights, name=name)
    elif config.model_type == "gemma":
        from text_generation_server.models.custom_modeling.flash_gemma_modeling import (
            FlashGemmaForCausalLM,
        )

        return FlashGemmaForCausalLM(prefix, config, weights)
    elif config.model_type == "gemma2":
        from text_generation_server.models.custom_modeling.flash_gemma2_modeling import (
            FlashGemma2ForCausalLM,
        )

        return FlashGemma2ForCausalLM(prefix, config, weights)
    elif config.model_type == "gemma3" or config.model_type == "gemma3_text":
        from text_generation_server.models.custom_modeling.flash_gemma3_modeling import (
            FlashGemma3ForCausalLM,
        )

        return FlashGemma3ForCausalLM(prefix, config, weights)
    elif config.model_type == "paligemma":
        from text_generation_server.models.custom_modeling.flash_gemma_modeling import (
            FlashGemmaForCausalLM,
        )

        return FlashGemmaForCausalLM(prefix, config, weights)
    else:
        raise RuntimeError(f"Unsupported model type {config.model_type}")


def load_vision_model(prefix, config, weights):
    if config.model_type == "clip_vision_model":
        from text_generation_server.models.custom_modeling.clip import (
            CLIPVisionTransformer,
        )

        return CLIPVisionTransformer(
            prefix=f"{prefix}.vision_model", config=config, weights=weights
        )
    if (
        config.model_type == "siglip_vision_model"
        or config.model_type == "gemma3_vision"
    ):
        from text_generation_server.models.custom_modeling.siglip import (
            SiglipVisionTransformer,
        )

        # TODO: ensure that using the prefix doesn't break any existing models
        # that rely on the old prefix (update the old models if necessary)
        return SiglipVisionTransformer(
            prefix=f"{prefix}.vision_model",
            config=config,
            weights=weights,
        )
    else:
        raise RuntimeError(f"Unsupported model type {config.model_type}")


================================================
FILE: backends/gaudi/server/text_generation_server/models/flash_causal_lm.py
================================================
import math
import os
import time
import torch
import torch.distributed

import numpy as np

from loguru import logger
from dataclasses import dataclass
from opentelemetry import trace
from transformers import (
    PreTrainedTokenizerBase,
    AutoConfig,
    AutoTokenizer,
    GenerationConfig,
)
from typing import (
    Any,
    Iterable,
    Optional,
    Tuple,
    List,
    Type,
    Dict,
    Union,
)
import torch.nn.functional as F
from text_generation_server.adapters import AdapterBatchData, AdapterBatchMetadata
from text_generation_server.utils.chunks import concat_text_chunks
from text_generation_server.models import Model
from text_generation_server.utils.log import log_master
from text_generation_server.utils.tokens import batch_top_tokens
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
    pad_next_token_chooser_parameters,
)
from text_generation_server.models.types import (
    Batch,
    Tokens,
    Generation,
    GeneratedText,
)
from text_generation_server.pb import generate_pb2
from text_generation_server.models.globals import (
    BLOCK_SIZE,
    REQUEST_LOGPROBS,
    TGI_WIGGLE_ROOM,
    get_adapter_to_index,
)
from text_generation_server.layers.attention import (
    KVCache,
    KVCompressCache,
    Seqlen,
    HPUPagedAttentionMetadata,
    trim_attn_metadata,
    trim_seqlen_metadata,
    _async_h2d_tensor_copy,
)
from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser
from text_generation_server.utils.dist import MEMORY_FRACTION
from text_generation_server.utils.quantization import get_loader
from text_generation_server.utils.segments import SegmentConcatBuilder, find_segments
from text_generation_server.utils.import_utils import (
    empty_cache,
    synchronize,
    get_free_memory,
)
from text_generation_server.utils.prefill_chunking import (
    get_max_prefill_tokens,
)
import vllm_hpu_extension.environment as environment
import habana_frameworks.torch as htorch
import itertools
from vllm_hpu_extension.bucketing.common import get_bucketing_context
from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes

tracer = trace.get_tracer(__name__)


def generate_block_metadata(
    dtype,
    use_contiguous_pa,
    slots,
    block_tables,
    bucketing_ctx,
    slots_in_window=None,
    block_bucket_size=None,
):
    # Prepare values if we need to continue decoding
    # need for HPUPagedAttentionMetadata preparation
    def flatten(in_list):
        return list(itertools.chain(*in_list))

    def gather_list(input, indices, v):
        return [input[i] if i is not None else v for i in indices]

    def pad_list(input, k, v):
        input_len = len(input)
        target_len = (input_len + k - 1) // k * k
        padding = target_len - input_len
        return input + [v] * padding

    last_block_usage = [slot % BLOCK_SIZE + 1 for slot in slots]
    block_groups = [[i] * len(bt) for i, bt in enumerate(block_tables)]
    block_usage = [
        [BLOCK_SIZE] * (len(bt) - 1) + [lbu]
        for bt, lbu in zip(block_tables, last_block_usage)
        if bt
    ]

    block_list = flatten(block_tables)
    block_groups = flatten(block_groups)
    block_usage = flatten(block_usage)
    assert len(block_list) == len(block_groups)
    assert len(block_list) == len(block_usage)
    if use_contiguous_pa:
        if block_bucket_size is None:
            block_bucket_size = max(max(block_list) + 1, len(block_list))
            if bucketing_ctx is not None:
                block_bucket_size = bucketing_ctx.get_padded_decode_num_blocks(
                    block_bucket_size
                )
        indices: List[Any]
        indices = [None] * block_bucket_size
        for i, bid in enumerate(block_list):
            indices[bid] = i
        block_list = gather_list(block_list, indices, 0)
        block_groups = gather_list(block_groups, indices, -1)
        block_usage = gather_list(block_usage, indices, 1)
    else:
        if block_bucket_size is None:
            block_bucket_size = len(block_list)
            if bucketing_ctx is not None:
                block_bucket_size = bucketing_ctx.get_padded_decode_num_blocks(
                    block_bucket_size
                )
        block_list = pad_list(block_list, block_bucket_size, 0)
        block_groups = pad_list(block_groups, block_bucket_size, -1)
        block_usage = pad_list(block_usage, block_bucket_size, 1)
    slots_in_window_mask = None
    if slots_in_window is not None:
        slot_list = [
            block_id * BLOCK_SIZE + slot_idx
            for block_id in block_list
            for slot_idx in range(BLOCK_SIZE)
        ]
        slot_list = torch.tensor(slot_list, dtype=torch.int64)
        slot_list = slot_list.view(-1, BLOCK_SIZE)
        slots_in_window_mask = torch.isin(slot_list, slots_in_window)
        for i in range(slots_in_window_mask.shape[0]):
            if not slots_in_window_mask[i].any():
                slots_in_window_mask[i, 0] = True

    block_list = torch.tensor(block_list, dtype=torch.int, device="cpu")
    block_groups = torch.tensor(block_groups, dtype=torch.int, device="cpu")
    block_usage = torch.tensor(block_usage, dtype=dtype, device="cpu")
    return (
        block_list,
        block_groups,
        block_usage,
        slots_in_window_mask,
        block_bucket_size,
    )


@dataclass
class FlashCausalLMBatch(Batch):
    batch_id: int
    requests: List[generate_pb2.Request]
    # request id -> idx in list mapping
    requests_idx_mapping: Dict[int, int]

    # Decoder values
    # Can be a list for easy filtering
    # If `input_ids` is a list, it needs to be materialized to a tensor first
    input_ids: Union[torch.Tensor, List[List[int]]]
    # Will be set by `generate_token` and reset after each prefill forward before staying set in decode
    position_ids: Optional[torch.Tensor]
    speculative_ids: Optional[torch.Tensor]

    # Set when creating the batch
    # tensor of indices of the currently used slots, length = \sum_{i=0}^{b} s_i in prefill, length = b in decode
    # Will be set by `generate_token` and reset after each prefill forward before staying set in decode
    slot_indices: Optional[torch.Tensor]

    # list of length b of list of length s_i // block_size
    block_tables: List[List[int]]
    # tensor of size [b, max_total_seqlen // block_size] holding the paged attention block tables for all sequences
    block_tables_tensor: torch.Tensor
    # tensor of length \sum_{i=0}^{b} max_s_i  holding the paged attention slots for all sequences
    slots: torch.Tensor
    # list of length b + 1  containing the cumulative sequence slot lengths of the sequences in the batch
    # used for filtering
    cu_slots: torch.Tensor

    max_input_length: int
    max_current_length: int

    # Whether this batch contains at least one request that is prefilling
    prefilling: bool
    # Whether each request is prefilling
    prefilling_mask: List[bool]

    # Prefill metadata tensors to efficiently compute logprobs
    # tensor of length b + 1  containing the cumulative sequence lengths of the sequences in the batch, only used in prefill
    cu_seqlen_prefill: Optional[torch.Tensor]
    # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers
    # as we only keep SLIDING_WINDOW values instead of the whole tensor
    prefill_cache_indices: Optional[torch.Tensor]
    # Will be set by `generate_token` and reset after each prefill forward
    prefill_head_indices: Optional[torch.Tensor]
    # Will be set by `generate_token` and reset after each prefill forward
    prefill_next_token_indices: Optional[torch.tensor]
    # Will be set by `generate_token` and reset after each prefill forward
    prefill_cu_outlens: Optional[List[int]]
    # Will be set by `generate_token` and reset after each prefill forward
    prefill_logprob_tokens: List[Optional[Tokens]]

    # All tokens
    all_input_ids: List[List[int]]
    all_input_ids_tensor: torch.Tensor

    # Lengths of all generations present in the batch
    input_lengths: List[int]
    # size [b], containing the number of blocks that can be retrieved from the cache
    cache_lengths: List[int]
    prompt_lengths: List[int]
    # Will be set by `generate_token` and reset after each prefill forward before staying set in decode
    input_lengths_tensor: Optional[torch.Tensor]
    cache_lengths_tensor: Optional[torch.Tensor]
    prompt_lengths_tensor: torch.Tensor

    prefix_offsets: List[Optional[int]]
    read_offsets: List[Optional[int]]

    # Generation helpers
    next_token_chooser: HeterogeneousNextTokenChooser
    stopping_criterias: List[StoppingCriteria]
    top_n_tokens: List[int]
    top_n_tokens_tensor: torch.Tensor

    # Adapter metadata for each request
    # Will be set by `generate_token` and reset after each prefill forward before staying set in decode
    adapter_meta: Optional[AdapterBatchMetadata]

    # Number of blocks in this batch
    num_blocks: int
    # Maximum number of blocks
    max_blocks: int

    hpu_attn_meta: Optional[HPUPagedAttentionMetadata]

    next_token_logits: Optional[torch.Tensor]
    speculative_logits: Optional[torch.Tensor]
    valid_indices: Optional[List[int]]

    def to_pb(self) -> generate_pb2.CachedBatch:
        return generate_pb2.CachedBatch(
            id=self.batch_id,
            request_ids=[r.id for r in self.requests],
            size=len(self),
            max_tokens=self.num_blocks * BLOCK_SIZE,
            current_tokens=(
                sum([len(i) for i in self.input_ids])
                if isinstance(self.input_ids, list)
                else len(self.input_ids)
            ),
        )

    @classmethod
    def batch_tokenized_inputs(
        cls, requests: Iterable[generate_pb2.Request], tokenizer
    ):
        max_length = 0
        all_input_ids = []
        batch_size = 0
        for r in requests:
            batch_size += 1
            inputs = concat_text_chunks(r.input_chunks.chunks)
            input_ids = tokenizer(
                inputs,
                truncation=True,
                max_length=r.truncate,
                add_special_tokens=r.add_special_tokens,
            )["input_ids"]
            max_length = max(max_length, len(input_ids))
            all_input_ids.append(input_ids)
        return all_input_ids

    @classmethod
    def from_tokenized(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        batch_tokenized_inputs,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "FlashCausalLMBatch":
        cache_lengths = []
        input_lengths = []
        prompt_lengths = []
        prefix_offsets = []
        read_offsets = []
        all_input_ids = []
        all_postfix_ids = []
        requests_idx_mapping = {}
        slots = []
        cu_slots = [0]

        next_token_chooser_parameters = []
        stopping_criterias = []
        top_n_tokens = []

        num_blocks = 0
        max_input_length = 0
        max_current_length = 0
        max_length = 0
        max_blocks = 0

        cu_blocks = [0]
        block_tables = []
        block_tables_ragged = []

        # Parse batch
        for i, (r, tokenized_input) in enumerate(
            zip(pb.requests, batch_tokenized_inputs)
        ):
            ### XXX: This consumes so much memory on long requests
            ### Deactivating it by default seems like the best course.
            if not REQUEST_LOGPROBS:
                r.prefill_logprobs = False
            else:
                assert False, "prefill_logprobs not supported yet"
            # request id -> idx in list mapping
            requests_idx_mapping[r.id] = i

            prompt_length = len(tokenized_input)
            prompt_lengths.append(prompt_length)

            cache_length = r.cache_len

            assert (
                cache_length <= prompt_length
            ), f"Prefix {cache_length} vs input {prompt_length}"
            if cache_length == prompt_length:
                assert False, "unreachable"

            # `chunk_len` is an optional field in the protobuf
            # It is only set if the model support chunking
            # Use all the remaining ids
            postfix_ids = tokenized_input[cache_length:]
            input_length = len(postfix_ids)

            input_lengths.append(input_length)

            prefix_offsets.append(prompt_length - 5)
            read_offsets.append(prompt_length)

            all_postfix_ids.append(postfix_ids)
            all_input_ids.append(tokenized_input)

            next_token_chooser_parameters.append(r.parameters)

            stopping_criteria = StoppingCriteria.from_pb(
                r.stopping_parameters, tokenizer
            )
            max_new_tokens = stopping_criteria.max_new_tokens
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(r.top_n_tokens)

            # Paged attention
            # Remove one as the first token des not have a past
            speculative_length = get_speculate()
            speculative_length = 0 if speculative_length is None else speculative_length

            # Tokens that need to be mapped to blocks.
            block_tokens = prompt_length + max_new_tokens - 1 + speculative_length

            # blocks and slots can be empty (for example in warmup)
            if not r.blocks:
                needed_blocks = math.ceil(block_tokens / BLOCK_SIZE)
                request_blocks = [
                    b for b in range(num_blocks, num_blocks + needed_blocks)
                ]
                request_slots = [
                    s
                    for b in request_blocks
                    for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE)
                ]
            else:
                request_blocks = r.blocks
                request_slots = r.slots

            block_tables.append(request_blocks)
            block_tables_ragged.extend(request_blocks)
            cu_blocks.append(len(block_tables_ragged))

            slots.extend(request_slots)
            cu_slots.append(len(slots))

            cache_lengths.append(cache_length)
            num_blocks += len(request_blocks)

            # Update
            max_blocks = max(max_blocks, len(request_blocks))
            max_input_length = max(max_input_length, input_length)
            max_current_length = max(max_current_length, cache_length + input_length)
            max_length = max(
                max_length,
                prompt_length + max_new_tokens + speculative_length,
            )

        next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
            next_token_chooser_parameters, dtype, device, tokenizer
        )

        # Padded all_input_ids_tensor
        all_input_ids_tensor = np.zeros(
            (len(all_input_ids), max_length), dtype=np.int64
        )
        for i, input_ids in enumerate(all_input_ids):
            all_input_ids_tensor[i, : len(input_ids)] = input_ids

        # put on cpu temporarily, move to hpu in prepare_for_prefill
        all_input_ids_tensor = torch.tensor(all_input_ids_tensor, dtype=torch.int64)

        top_n_tokens_tensor = torch.tensor(top_n_tokens, dtype=torch.int64)

        block_tables_ragged = torch.tensor(block_tables_ragged, dtype=torch.int32)
        cu_blocks = torch.tensor(cu_blocks, dtype=torch.int64)
        block_tables_tensor = torch.empty(
            (len(block_tables), max_blocks),
            dtype=torch.int32,
        )

        for i, request_blocks in enumerate(block_tables):
            block_tables_tensor[i, : len(request_blocks)] = torch.tensor(request_blocks)

        prompt_lengths_tensor = torch.tensor(prompt_lengths, dtype=torch.int32)

        slots = torch.tensor(slots, dtype=torch.int64)
        cu_slots = torch.tensor(cu_slots, dtype=torch.int64)

        return cls(
            batch_id=pb.id,
            requests=pb.requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=all_postfix_ids,
            block_tables=block_tables,
            block_tables_tensor=block_tables_tensor,
            cache_lengths=cache_lengths,
            max_input_length=max_input_length,
            max_current_length=max_current_length,
            prefilling=True,
            prefilling_mask=[True] * len(pb.requests),
            prefill_logprob_tokens=[None] * len(pb.requests),
            input_lengths=input_lengths,
            prompt_lengths=prompt_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            all_input_ids=all_input_ids,
            all_input_ids_tensor=all_input_ids_tensor,
            next_token_chooser=next_token_chooser,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            num_blocks=num_blocks,
            max_blocks=max_blocks,
            speculative_ids=None,
            prompt_lengths_tensor=prompt_lengths_tensor,
            # These values will be set by `FlashCausalLMBatch.prepare_for_prefill`
            position_ids=None,
            cu_seqlen_prefill=None,
            prefill_cache_indices=None,
            slot_indices=None,
            slots=slots,
            cu_slots=cu_slots,
            prefill_head_indices=None,
            prefill_next_token_indices=None,
            prefill_cu_outlens=None,
            cache_lengths_tensor=None,
            input_lengths_tensor=None,
            adapter_meta=None,
            hpu_attn_meta=None,
            next_token_logits=None,
            speculative_logits=None,
            valid_indices=None,
        )

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "FlashCausalLMBatch":
        assert len(pb.requests) > 0
        batch_tokenized_inputs = cls.batch_tokenized_inputs(pb.requests, tokenizer)
        return cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device)

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch":
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")
        # We assume that if len(requests) == len(self) then the requests are the same
        if len(request_ids) == len(self):
            return self

        device = self.block_tables_tensor.device

        # New values after filtering
        requests_idx_mapping = {}

        # Used to index into tensors
        indices = []

        # slots to keep after filtering
        slot_filtering_indices = torch.zeros(self.slots.shape[0], dtype=torch.bool)

        # Create on CPU to only move to GPU once instead of at every copy
        slot_indices = torch.empty(len(request_ids), dtype=torch.int64)
        max_input_length = 0
        max_current_length = 0

        requests = []
        block_tables = []
        all_input_ids = []
        input_ids = []

        prompt_lengths = []
        input_lengths = []
        cache_lengths = []
        prefix_offsets = []
        read_offsets = []
        cu_slots = [0]

        prefilling_mask = []
        prefill_logprob_tokens = []

        stopping_criterias = []
        adapter_set = set()

        num_blocks = 0
        max_blocks = 0
        max_slots = 0
        cumulative_slot_tokens = 0

        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            indices.append(idx)
            requests_idx_mapping[request_id] = i

            requests.append(self.requests[idx])

            # Prefilling
            request_prefilling = self.prefilling_mask[idx]
            prefilling_mask.append(request_prefilling)

            # Get length
            request_input_length = self.input_lengths[idx]
            request_cache_length = self.cache_lengths[idx]
            max_input_length = max(max_input_length, request_input_length)
            max_current_length = max(
                max_current_length, request_cache_length + request_input_length
            )

            all_input_ids.append(self.all_input_ids[idx])

            prompt_lengths.append(self.prompt_lengths[idx])
            input_lengths.append(request_input_length)
            cache_lengths.append(request_cache_length)
            prefix_offsets.append(self.prefix_offsets[idx])
            read_offsets.append(self.read_offsets[idx])

            stopping_criteria = self.stopping_criterias[idx]
            stopping_criterias.append(stopping_criteria)

            prefill_logprob_tokens.append(self.prefill_logprob_tokens[idx])

            ADAPTER_TO_INDEX = get_adapter_to_index()
            adapter_index = ADAPTER_TO_INDEX.get(self.requests[idx].adapter_id, 0)
            adapter_set.add(adapter_index)

            request_block_table = self.block_tables[idx]
            num_blocks += len(request_block_table)
            block_tables.append(request_block_table)

            start_slot = self.cu_slots[idx]
            end_slot = self.cu_slots[idx + 1]
            slot_length = end_slot - start_slot

            # Set slice
            slot_filtering_indices[start_slot:end_slot] = True

            cu_slots.append(cumulative_slot_tokens + slot_length)

            # Input ids if the request was part of a prefilling batch
            # If the batch was decoding we can index into the tensor directly later
            if self.prefilling:
                input_ids.append(self.input_ids[idx])
            else:
                # Copy to tensor (CPU)
                slot_indices[i] = cumulative_slot_tokens + request_cache_length

            cumulative_slot_tokens += slot_length
            max_blocks = max(max_blocks, len(request_block_table))
            max_slots = max(max_slots, slot_length)

        block_tables_tensor = self.block_tables_tensor[indices]
        prompt_lengths_tensor = self.prompt_lengths_tensor[indices]

        cu_slots = torch.tensor(cu_slots, dtype=torch.int64)

        slots = self.slots[slot_filtering_indices]

        if self.prefilling:
            # These values will be set by `FlashCausalLMBatch.prepare_for_prefill`
            position_ids = None
            slot_indices = None
            cache_lengths_tensor = None
            input_lengths_tensor = None
            adapter_meta = None
        else:
            # Index into tensors
            input_ids = self.input_ids[indices]
            position_ids = self.position_ids[indices]
            input_lengths_tensor = self.input_lengths_tensor[indices]
            cache_lengths_tensor = self.cache_lengths_tensor[indices]

            # Move to GPU now that we have the whole tensor
            slot_indices = slot_indices.to(device)
            if self.adapter_meta is not None:
                adapter_indices = self.adapter_meta.adapter_indices[indices]
                adapter_segments, adapter_segment_indices = find_segments(
                    adapter_indices
                )
                adapter_segments = torch.tensor(adapter_segments, dtype=torch.int32)
                adapter_meta = AdapterBatchMetadata(
                    adapter_indices=adapter_indices,
                    adapter_set=adapter_set,
                    adapter_segments=adapter_segments,
                    segment_indices=adapter_segment_indices,
                )
            else:
                adapter_meta = None
        htorch.core.mark_step()
        return type(self)(
            batch_id=self.batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            position_ids=position_ids,
            cu_seqlen_prefill=None,
            prefill_cache_indices=None,
            slot_indices=slot_indices,
            block_tables=block_tables,
            block_tables_tensor=block_tables_tensor,
            slots=slots,
            cu_slots=cu_slots,
            max_input_length=max_input_length,
            max_current_length=max_current_length,
            prefilling=self.prefilling,
            prefilling_mask=prefilling_mask,
            prefill_head_indices=None,
            prefill_next_token_indices=None,
            prefill_cu_outlens=None,
            prefill_logprob_tokens=prefill_logprob_tokens,
            prompt_lengths=prompt_lengths,
            prompt_lengths_tensor=prompt_lengths_tensor,
            input_lengths=input_lengths,
            input_lengths_tensor=input_lengths_tensor,
            cache_lengths=cache_lengths,
            cache_lengths_tensor=cache_lengths_tensor,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            all_input_ids=all_input_ids,
            all_input_ids_tensor=self.all_input_ids_tensor,
            next_token_chooser=self.next_token_chooser,
            stopping_criterias=stopping_criterias,
            top_n_tokens=self.top_n_tokens,
            top_n_tokens_tensor=self.top_n_tokens_tensor,
            num_blocks=num_blocks,
            max_blocks=max_blocks,
            speculative_ids=self.speculative_ids,
            adapter_meta=adapter_meta,
            hpu_attn_meta=None,
            valid_indices=indices,
            next_token_logits=self.next_token_logits,
            speculative_logits=self.speculative_logits,
        )

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(
        cls, batches: List["FlashCausalLMBatch"], padded_total_bs: int = 0
    ) -> "FlashCausalLMBatch":
        # Batch attributes
        requests = []
        requests_idx_mapping = {}

        prefilling = False
        num_blocks = 0
        total_batch_size = 0
        total_slots = 0
        max_blocks = 0
        max_length = 0
        max_input_length = 0
        max_current_length = 0
        ADAPTER_TO_INDEX = get_adapter_to_index()
        for b in batches:
            total_batch_size += len(b)
            max_blocks = max(max_blocks, b.max_blocks)
            total_slots += len(b.slots)
            num_blocks += b.num_blocks
            speculative_length = (
                b.speculative_ids.shape[1] if b.speculative_ids is not None else 0
            )
            max_input_length = max(max_input_length, b.max_input_length)
            max_current_length = max(max_current_length, b.max_current_length)
            max_length = max(
                max_length,
                max(
                    prompt_length
                    + stopping_criteria.max_new_tokens
                    + speculative_length
                    for prompt_length, stopping_criteria in zip(
                        b.prompt_lengths, b.stopping_criterias
                    )
                ),
            )
            prefilling = prefilling or b.prefilling

        slots = batches[0].slots.new_empty(total_slots)
        cu_slots = torch.zeros(total_batch_size + 1, dtype=torch.int64)
        if prefilling:
            input_ids = []
            # These values will be set by `FlashCausalLMBatch.prepare_for_prefill`
            position_ids = None
            slot_indices = None
            cache_lengths_tensor = None
            input_lengths_tensor = None
            adapter_meta = None
            adapter_segment_builder = None
        else:
            if padded_total_bs == batches[0].input_ids.shape[0]:
                input_ids = batches[0].input_ids
            else:
                input_ids = batches[0].input_ids.new_empty(total_batch_size)
            if (
                batches[0].position_ids is not None
                and batches[0].position_ids.dim() == 2
            ):
                # Qwen2_vl case:
                position_ids = batches[0].position_ids.new_empty(
                    (total_batch_size, batches[0].position_ids.shape[-1])
                )
            else:
                position_ids = batches[0].position_ids.new_empty(total_batch_size)
            slot_indices = batches[0].slot_indices.new_empty(total_batch_size)
            input_lengths_tensor = batches[0].input_lengths_tensor.new_empty(
                total_batch_size
            )
            cache_lengths_tensor = batches[0].cache_lengths_tensor.new_empty(
                total_batch_size
            )
            if ADAPTER_TO_INDEX:
                total_indices_size = sum(
                    b.adapter_meta.adapter_indices.shape[0] for b in batches
                )
                adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty(
                    total_indices_size
                )
                adapter_segment_builder = SegmentConcatBuilder()
                adapter_set = set()

        prompt_lengths_tensor = batches[0].prompt_lengths_tensor.new_empty(
            total_batch_size
        )
        block_tables_tensor = batches[0].block_tables_tensor.new_zeros(
            (total_batch_size, max_blocks)
        )
        all_input_ids_tensor = batches[0].all_input_ids_tensor
        top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros(
            total_batch_size,
        )

        block_tables = []
        cache_lengths = []
        all_input_ids = []

        prompt_lengths = []
        input_lengths = []
        prefix_offsets = []
        read_offsets = []

        prefill_logprob_tokens = []

        next_token_chooser_parameters = []
        fsm_grammar_states = []
        stopping_criterias = []
        top_n_tokens = []
        prefilling_mask = []

        # Cumulative length
        cumulative_batch_size = 0
        cumulative_slots = 0
        cumulative_adapter_indices_size = 0

        for i, batch in enumerate(batches):
            requests.extend(batch.requests)
            valid_bsize = len(batch)

            if i == 0:
                requests_idx_mapping = batch.requests_idx_mapping
            else:
                # We need to offset the mapping for each batch by the cumulative batch size
                for k, v in batch.requests_idx_mapping.items():
                    requests_idx_mapping[k] = v + cumulative_batch_size

            start_index = cumulative_batch_size
            end_index = cumulative_batch_size + valid_bsize

            index = torch.tensor(list(range(start_index, end_index)), device="cpu")
            top_n_tokens_tensor.index_copy_(0, index, batch.top_n_tokens_tensor)
            if i > 0:
                all_input_ids_tensor.index_copy_(
                    0,
                    index.to(batch.all_input_ids_tensor.device),
                    batch.all_input_ids_tensor[:valid_bsize, :],
                )

            block_tables_tensor[
                start_index:end_index, : batch.block_tables_tensor.shape[1]
            ] = batch.block_tables_tensor[:, :max_blocks]
            prompt_lengths_tensor.index_copy_(0, index, batch.prompt_lengths_tensor)

            slots_start_index = cumulative_slots
            slots_end_index = cumulative_slots + len(batch.slots)
            slot_index = torch.tensor(
                list(range(slots_start_index, slots_end_index)),
                device=batch.slots.device,
            )

            slots.index_copy_(0, slot_index, batch.slots)
            cu_slots[start_index + 1 : end_index + 1] = (
                batch.cu_slots[1:] + cumulative_slots
            )

            if not prefilling:
                if padded_total_bs != batches[0].input_ids.shape[0] or i > 0:
                    input_ids.index_copy_(
                        0, index.to(input_ids.device), batch.input_ids[:valid_bsize]
                    )
                position_ids.index_copy_(0, index, batch.position_ids[:valid_bsize])
                slot_indices.index_copy_(
                    0, index, batch.slot_indices + cumulative_slots
                )
                input_lengths_tensor.index_copy_(
                    0, index, batch.input_lengths_tensor[:valid_bsize]
                )
                cache_lengths_tensor.index_copy_(
                    0, index, batch.cache_lengths_tensor[:valid_bsize]
                )
                if ADAPTER_TO_INDEX:
                    adapter_start_index = cumulative_adapter_indices_size
                    adapter_end_index = (
                        cumulative_adapter_indices_size
                        + batch.adapter_meta.adapter_indices.shape[0]
                    )
                    adapter_indices[adapter_start_index:adapter_end_index] = (
                        batch.adapter_meta.adapter_indices
                    )
                    cumulative_adapter_indices_size = adapter_end_index
                    adapter_set.update(batch.adapter_meta.adapter_set)
                    adapter_segment_builder.concat(
                        batch.adapter_meta.adapter_segments,
                        batch.adapter_meta.segment_indices,
                    )
            else:
                if isinstance(batch.input_ids, torch.Tensor):
                    batch.input_ids = batch.input_ids.view(-1, 1).tolist()
                input_ids.extend(batch.input_ids)

            prefilling_mask.extend(batch.prefilling_mask)
            block_tables.extend(batch.block_tables)
            cache_lengths.extend(batch.cache_lengths)
            all_input_ids.extend(batch.all_input_ids)

            prompt_lengths.extend(batch.prompt_lengths)
            input_lengths.extend(batch.input_lengths)
            prefix_offsets.extend(batch.prefix_offsets)
            read_offsets.extend(batch.read_offsets)

            prefill_logprob_tokens.extend(batch.prefill_logprob_tokens)

            next_token_chooser_parameters.extend([r.parameters for r in batch.requests])
            fsm_grammar_states.extend(batch.next_token_chooser.fsm_grammar_states)
            stopping_criterias.extend(batch.stopping_criterias)

            top_n_tokens.extend(batch.top_n_tokens)

            # Update
            cumulative_slots += len(batch.slots)
            cumulative_batch_size += len(batch)

        next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
            next_token_chooser_parameters,
            dtype=batches[0].next_token_chooser.dtype,
            device=batches[0].next_token_chooser.device,
            tokenizer=batches[0].next_token_chooser.tokenizer,
            fsm_grammar_states=fsm_grammar_states,
        )

        # We skip computing the speculative_ids when the batch size is too large, so
        # we must check that all batches have them, otherwise they must be discarded
        if get_speculate() > 0 and all(b.speculative_ids is not None for b in batches):
            speculative_ids = torch.cat([b.speculative_ids for b in batches], dim=0)
        else:
            speculative_ids = None

        if ADAPTER_TO_INDEX and adapter_segment_builder is not None:
            adapter_segments, adapter_segment_indices = adapter_segment_builder.build()
            adapter_meta = AdapterBatchMetadata(
                adapter_indices=adapter_indices,
                adapter_set=adapter_set,
                adapter_segments=adapter_segments,
                segment_indices=adapter_segment_indices,
            )

        return cls(
            batch_id=batches[0].batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            position_ids=position_ids,
            cu_seqlen_prefill=None,
            prefill_cache_indices=None,
            slot_indices=slot_indices,
            block_tables=block_tables,
            block_tables_tensor=block_tables_tensor,
            cache_lengths=cache_lengths,
            cache_lengths_tensor=cache_lengths_tensor,
            slots=slots,
            cu_slots=cu_slots,
            max_input_length=max_input_length,
            max_current_length=max_current_length,
            prefilling=prefilling,
            prefilling_mask=prefilling_mask,
            prefill_head_indices=None,
            prefill_next_token_indices=None,
            prefill_cu_outlens=None,
            prefill_logprob_tokens=prefill_logprob_tokens,
            prompt_lengths=prompt_lengths,
            prompt_lengths_tensor=prompt_lengths_tensor,
            input_lengths=input_lengths,
            input_lengths_tensor=input_lengths_tensor,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            all_input_ids=all_input_ids,
            all_input_ids_tensor=all_input_ids_tensor,
            next_token_chooser=next_token_chooser,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            num_blocks=num_blocks,
            max_blocks=max_blocks,
            speculative_ids=speculative_ids,
            adapter_meta=adapter_meta if ADAPTER_TO_INDEX else None,
            hpu_attn_meta=None,
            next_token_logits=None,
            speculative_logits=None,
            valid_indices=None,
        )

    def prepare_for_decode(
        self, dtype, use_contiguous_pa, bucketing_ctx, pad_token_id, sliding_window
    ):
        block_num = [length // BLOCK_SIZE + 1 for length in self.cache_lengths]
        block_tables = []
        for i, bt in enumerate(self.block_tables):
            block_tables.append(bt[0 : block_num[i]])
        if bucketing_ctx is not None:
            padded_bs = bucketing_ctx.get_padded_decode_batch_size(
                self.input_ids.shape[0]
            )
        else:
            padded_bs = self.input_ids.shape[0]
        slots = self.slots[self.slot_indices]

        block_list, block_groups, block_usage, _, block_bucket_size = (
            generate_block_metadata(
                dtype,
                use_contiguous_pa,
                slots,
                block_tables,
                bucketing_ctx,
            )
        )
        meta = HPUPagedAttentionMetadata(
            block_list=_async_h2d_tensor_copy(block_list),
            block_groups=_async_h2d_tensor_copy(block_groups),
            block_usage=_async_h2d_tensor_copy(block_usage),
            block_mapping=None,
            attn_bias=None,
        )
        if sliding_window is not None:
            block_tables_in_window = []
            for i, bt in enumerate(self.block_tables):
                block_num_in_window = (
                    sliding_window + 2 * BLOCK_SIZE - 2 - slots[i] % BLOCK_SIZE
                ) // BLOCK_SIZE
                block_tables_in_window.append(
                    bt[max(0, block_num[i] - block_num_in_window) : block_num[i]]
                )
            slots_in_window = []
            for i, indice in enumerate(self.slot_indices):
                start_idx = indice - self.cache_lengths[i]
                mask = (
                    indice
                    - torch.arange(
                        start_idx,
                        indice + 1,
                        device=self.slots.device,
                    )
                ) < sliding_window
                slots_in_window.append(self.slots[start_idx : indice + 1][mask])
            slots_in_window = torch.cat(slots_in_window, dim=0)
            (
                block_list_in_window,
                block_groups_in_window,
                block_usage_in_window,
                slots_in_window_mask,
                _,
            ) = generate_block_metadata(
                dtype,
                use_contiguous_pa,
                slots,
                block_tables_in_window,
                bucketing_ctx,
                slots_in_window,
                block_bucket_size,
            )
            meta.block_list_in_window = _async_h2d_tensor_copy(block_list_in_window)
            meta.block_groups_in_window = _async_h2d_tensor_copy(block_groups_in_window)
            meta.block_usage_in_window = _async_h2d_tensor_copy(block_usage_in_window)
            meta.slots_in_window_mask = _async_h2d_tensor_copy(slots_in_window_mask)

        self.hpu_attn_meta = trim_attn_metadata(meta)
        self.input_ids = F.pad(
            self.input_ids, (0, padded_bs - self.input_ids.shape[0]), value=pad_token_id
        )

        if self.position_ids.dim() == 2:
            # Qwen VL case
            self.position_ids = F.pad(
                self.position_ids,
                (0, 0, 0, padded_bs - self.position_ids.shape[0]),
                value=1,
            )
        else:
            self.position_ids = F.pad(
                self.position_ids, (0, padded_bs - self.position_ids.shape[0]), value=1
            )
        self.input_lengths_tensor = F.pad(
            self.input_lengths_tensor,
            (0, padded_bs - self.input_lengths_tensor.shape[0]),
            value=0,
        )
        self.cache_lengths_tensor = F.pad(
            self.cache_lengths_tensor,
            (0, padded_bs - self.cache_lengths_tensor.shape[0]),
            value=0,
        )
        if len(self.next_token_chooser.do_sample) != padded_bs:
            next_token_chooser_parameters = []
            next_token_chooser_parameters.extend([r.parameters for r in self.requests])
            pad_next_token_chooser_parameters(next_token_chooser_parameters, padded_bs)
            # update past grammar states
            fsm_grammar_states = [0] * padded_bs

            for i, req in enumerate(self.requests):
                fsm_grammar_states[i] = self.next_token_chooser.fsm_grammar_states[i]

            self.next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
                next_token_chooser_parameters,
                self.next_token_chooser.dtype,
                self.next_token_chooser.device,
                self.next_token_chooser.tokenizer,
                fsm_grammar_states,
            )

    def prepare_for_prefill(
        self, max_padded_input_len, max_padded_bs, max_total_tokens, pad_token_id
    ):
        # Prepare values if we need to continue prefilling
        # Speculation must be ignored while we prefill even with chunking
        # it simplifies everything
        assert self.speculative_ids is None

        # device = self.block_tables_tensor.device

        # hpu does not support varlen for prefill, use sdpa instead. so need to pad input_tensor, position
        # padding to left to work with sliding window
        # use prefill_cache_indices to indicate the valid kv slot, update prefill_next_token_indices to indicate
        # the right logit position
        input_ids_padded_length = []
        # need extra pad to match warmup seq
        extra_pad = max_padded_input_len - self.max_input_length
        extra_pad_bs = max_padded_bs - len(self)
        device = "hpu"
        if isinstance(self.input_ids, list) and len(self) > 1:
            input_ids_padded_length = []
            input_ids = []
            for input_id in self.input_ids:
                padded = self.max_input_length - len(input_id) + extra_pad
                if padded > 0:
                    input_id = [pad_token_id] * padded + input_id
                input_ids.append(input_id)
                input_ids_padded_length.append(padded)
            input_ids = np.concatenate(input_ids, dtype=np.int64)
            self.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)
        elif isinstance(self.input_ids, list):
            input_ids = self.input_ids[0]
            input_ids_padded_length.append(extra_pad)
            input_ids = [pad_token_id] * extra_pad + input_ids
            self.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)
        else:
            input_ids = torch.full(
                (max_padded_input_len * len(self),),
                pad_token_id,
                dtype=torch.int64,
                device=self.input_ids.device,
            )
            src_pos = 0
            for i in range(len(self)):
                end_pos = (i + 1) * max_padded_input_len
                start_pos = end_pos - self.input_lengths[i]
                input_ids[start_pos:end_pos] = self.input_ids[
                    src_pos : src_pos + self.input_lengths[i]
                ]
                input_ids_padded_length.append(
                    max_padded_input_len - self.input_lengths[i]
                )
                src_pos += self.input_lengths[i]
            self.input_ids = input_ids

        self.input_ids = F.pad(
            self.input_ids, (0, extra_pad_bs * max_padded_input_len), value=pad_token_id
        )

        self.input_lengths_tensor = torch.tensor(self.input_lengths, dtype=torch.int32)

        self.input_lengths_tensor = F.pad(
            self.input_lengths_tensor, (0, extra_pad_bs), value=0
        )

        cu_seqlen_prefill = self.input_lengths_tensor.new_zeros(max_padded_bs + 1)
        torch.cumsum(self.input_lengths_tensor, out=cu_seqlen_prefill[1:], dim=0)
        self.cu_seqlen_prefill = cu_seqlen_prefill.to(torch.int32)
        self.cache_lengths_tensor = torch.tensor(self.cache_lengths, dtype=torch.int32)
        self.cache_lengths_tensor = F.pad(
            self.cache_lengths_tensor, (0, extra_pad_bs), value=0
        )

        position_ids = []
        slot_indices = []
        prefill_cache_indices = []
        all_prefill_logprobs = True
        no_prefill_logprobs = True
        prefill_cu_outlens = [0]

        # Cumulative length
        cumulative_length = 0
        cumulative_slot_tokens = 0
        prefill_out_cumulative_length = 0

        adapter_indices_list = []
        adapter_set = set()

        for i, (
            r,
            cache_length,
            input_length,
            prompt_length,
            request_prefilling,
            blocks,
        ) in enumerate(
            zip(
                self.requests,
                self.cache_lengths,
                self.input_lengths,
                self.prompt_lengths,
                self.prefilling_mask,
                self.block_tables,
            )
        ):
            next_chunk_length = input_length

            # Position ids
            request_position_ids = torch.arange(
                cache_length, cache_length + input_length, dtype=torch.int32
            )
            request_position_ids = F.pad(
                request_position_ids, (input_ids_padded_length[i], 0), value=1
            )
            position_ids.append(request_position_ids)

            if not r.slots:
                request_slots = [
                    s
                    for b in blocks
                    for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE)
                ]
            else:
                request_slots = r.slots

            request_slot_indices = torch.arange(
                cache_length + cumulative_slot_tokens,
                cache_length + cumulative_slot_tokens + input_length,
                dtype=torch.int64,
            )

            slot_indices.append(request_slot_indices)

            # Update
            cumulative_slot_tokens += len(request_slots)

            # Create tensor to slice into the kv tensor in prefill
            # hpu need request_prefill_cache_indices to skip padding in kv cache
            sliding_window = input_length
            cumulative_length += input_ids_padded_length[i]
            if sliding_window is not None:
                request_prefill_cache_indices = torch.arange(
                    cumulative_length + max(0, input_length - sliding_window),
                    cumulative_length + input_length,
                    dtype=torch.int64,
                )

            # Prefill logprobs is ignored if the request is done prefilling
            prefill_logprobs = r.prefill_logprobs and request_prefilling

            all_prefill_logprobs = all_prefill_logprobs and prefill_logprobs
            no_prefill_logprobs = no_prefill_logprobs and not prefill_logprobs

            if prefill_logprobs:
                prefill_cu_outlens.append(prefill_out_cumulative_length + input_length)
                prefill_out_cumulative_length += input_length
            else:
                prefill_cu_outlens.append(prefill_out_cumulative_length + 1)
                prefill_out_cumulative_length += 1

            prefill_cache_indices.append(request_prefill_cache_indices)

            ADAPTER_TO_INDEX = get_adapter_to_index()
            if ADAPTER_TO_INDEX:
                adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0)
                adapter_indices_list.append(
                    torch.full((next_chunk_length,), adapter_index)
                )
                adapter_set.add(adapter_index)

            # Update
            cumulative_length += next_chunk_length

        if not all_prefill_logprobs and not no_prefill_logprobs:
            prefill_head_indices = []
            prefill_next_token_indices = []

            # Cumulative length
            cumulative_length = 0
            prefill_out_cumulative_length = 0

            for i, (
                r,
                input_length,
                request_prefilling,
            ) in enumerate(
                zip(
                    self.requests,
                    self.input_lengths,
                    self.prefilling_mask,
                )
            ):
                # Prefill logprobs is ignored if the request is done prefilling
                prefill_logprobs = r.prefill_logprobs and request_prefilling

                if prefill_logprobs:
                    prefill_head_indices.append(
                        torch.arange(
                            cumulative_length,
                            cumulative_length + input_length,
                            dtype=torch.int32,
                        )
                    )
                    prefill_next_token_indices.append(
                        prefill_out_cumulative_length + input_length - 1
                    )
                    prefill_out_cumulative_length += input_length
                else:
                    prefill_head_indices.append(
                        torch.tensor(
                            [cumulative_length + input_length - 1],
                            dtype=torch.int32,
                        )
                    )
                    prefill_next_token_indices.append(prefill_out_cumulative_length)
                    prefill_out_cumulative_length += 1

                # Update
                cumulative_length += input_length

        if len(self) > 1:
            if position_ids:
                position_ids = torch.cat(position_ids)
            if slot_indices:
                slot_indices = torch.cat(slot_indices)
            prefill_cache_indices = torch.cat(prefill_cache_indices)
        else:
            if position_ids:
                position_ids = position_ids[0]
            if slot_indices:
                slot_indices = slot_indices[0]
            prefill_cache_indices = prefill_cache_indices[0]

        self.position_ids = position_ids
        self.position_ids = F.pad(
            self.position_ids, (0, extra_pad_bs * max_padded_input_len), value=1
        )
        self.slot_indices = slot_indices

        self.prefill_cu_outlens = prefill_cu_outlens
        self.prefill_cache_indices = torch.zeros_like(
            self.input_ids, dtype=torch.bool, device="cpu"
        )
        self.prefill_cache_indices[prefill_cache_indices] = True

        if all_prefill_logprobs:
            prefill_head_indices = None
            prefill_next_token_indices = self.cu_seqlen_prefill[1:] - 1
        elif no_prefill_logprobs:
            prefill_head_indices = self.cu_seqlen_prefill[1:] - 1
            prefill_next_token_indices = None
        else:
            prefill_head_indices = torch.cat(prefill_head_indices)
            prefill_next_token_indices = torch.tensor(
                prefill_next_token_indices, dtype=torch.int64
            )

        self.prefill_head_indices = prefill_head_indices
        self.prefill_next_token_indices = prefill_next_token_indices
        input_ids_padded_length_tensor = torch.cumsum(
            torch.tensor(input_ids_padded_length, dtype=torch.int32),
            dim=-1,
        ).to(torch.int32)
        input_ids_padded_length_tensor = F.pad(
            input_ids_padded_length_tensor, (0, extra_pad_bs), value=0
        )
        if self.prefill_head_indices is not None:
            self.prefill_head_indices = (
                self.prefill_head_indices + input_ids_padded_length_tensor
            )

        if self.prefill_next_token_indices is not None:
            self.prefill_next_token_indices = (
                self.prefill_next_token_indices + input_ids_padded_length_tensor
            )
        all_input_ids_tensor = torch.full(
            (max_padded_bs, max(max_total_tokens, self.all_input_ids_tensor.shape[-1])),
            pad_token_id,
            dtype=torch.int64,
            device="hpu",
        )
        for i in range(len(self)):
            all_input_ids_tensor[i, : self.all_input_ids_tensor.shape[-1]] = (
                self.all_input_ids_tensor[i]
            )
        self.all_input_ids_tensor = all_input_ids_tensor
        if len(self.next_token_chooser.do_sample) != max_padded_bs:
            next_token_chooser_parameters = []
            next_token_chooser_parameters.extend([r.parameters for r in self.requests])
            pad_next_token_chooser_parameters(
                next_token_chooser_parameters, max_padded_bs
            )
            # update past grammar states
            fsm_grammar_states = [0] * max_padded_bs

            for i, req in enumerate(self.requests):
                fsm_grammar_states[i] = self.next_token_chooser.fsm_grammar_states[i]

            self.next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
                next_token_chooser_parameters,
                self.next_token_chooser.dtype,
                self.next_token_chooser.device,
                self.next_token_chooser.tokenizer,
                fsm_grammar_states,
            )

        if ADAPTER_TO_INDEX:
            if adapter_set:
                adapter_indices = torch.cat(adapter_indices_list).to(dtype=torch.int64)
                adapter_segments, adapter_segment_indices = find_segments(
                    adapter_indices
                )
            else:
                adapter_indices = torch.zeros_like(self.input_ids)
                adapter_segments = [0, len(adapter_indices)]
                adapter_segment_indices = [len(adapter_indices) - 1]

            adapter_segments = torch.tensor(adapter_segments, dtype=torch.int32)
            self.adapter_meta = AdapterBatchMetadata(
                adapter_indices=adapter_indices,
                adapter_set=adapter_set,
                adapter_segments=adapter_segments,
                segment_indices=adapter_segment_indices,
            )

    def __len__(self):
        return len(self.requests)


ADAPTER_LAYERS = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
ROW_PARALLEL = {"o_proj", "down_proj", "lm_head"}


class FlashCausalLM(Model):
    def __init__(
        self,
        model_id: str,
        model_class,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
        lora_adapter_ids: Optional[list] = [],
        tokenizer_class: PreTrainedTokenizerBase = AutoTokenizer,
        config_class: PreTrainedTokenizerBase = AutoConfig,
        default_dtype=torch.float16,
        aliases=None,
        # Used for Santacoder override of config
        num_kv_heads: Optional[int] = None,
        # Deepseek V2 uses different QK and V dims.
        head_size: Optional[int] = None,
        skip_special_tokens: bool = True,
        kv_cache_dtype: Optional[torch.dtype] = None,
        support_chunking: bool = True,
    ):
        self.quantize = quantize
        self.process_group, rank, world_size = initialize_torch_distributed()
        if world_size > 1:
            self.process_group_cpu = torch.distributed.new_group(backend="gloo")

        device = torch.device("hpu")
        dtype = torch.bfloat16 if dtype is None else dtype

        tokenizer = tokenizer_class.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        try:
            generation_config = GenerationConfig.from_pretrained(
                model_id, revision=revision, trust_remote_code=trust_remote_code
            )
            if isinstance(generation_config.eos_token_id, (list, set)):
                # TODO Huge hack
                tokenizer._eos_token_ids = set(generation_config.eos_token_id)
        except Exception:
            pass

        config = config_class.from_pretrained(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        config.quantize = quantize
        config.speculator = speculator

        torch.distributed.barrier(group=self.process_group)

        weights_loader = get_loader(quantize, model_id, revision)
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(
            filenames,
            device,
            dtype,
            process_group=self.process_group,
            aliases=aliases,
            weights_loader=weights_loader,
        )

        prefix = None
        model = model_class(prefix, config, weights)
        torch.distributed.barrier(group=self.process_group)

        # VLM models define the config we care about in their text_config
        text_config = getattr(config, "text_config", None)
        if text_config is not None:
            config = text_config

        if getattr(config, "sliding_window", None) is None:
            config.sliding_window = None
        if getattr(config, "use_sliding_window", True) is False:
            config.sliding_window = None

        self.num_layers = config.num_hidden_layers
        self.num_heads = config.num_attention_heads // self.process_group.size()
        self.config = config
        # Validation is done in the model itself
        if num_kv_heads is None:
            num_kv_heads = getattr(config, "num_key_value_heads", None)
            # GPT-2 workaround
            if num_kv_heads is None:
                num_kv_heads = getattr(config, "n_head", None)
        if num_kv_heads is None:
            raise ValueError("Cannot get the number of key/value heads")
        self.num_kv_heads = (
            num_kv_heads // self.process_group.size()
            if num_kv_heads // self.process_group.size() > 0
            else num_kv_heads
        )
        assert self.num_kv_heads > 0

        if head_size is None:
            # Some models use GQA and different sizes for o_proj
            # and q_proj, that allows for that.
            if getattr(config, "head_dim", None) is not None:
                self.head_size = config.head_dim
            else:
                self.head_size = config.hidden_size // config.num_attention_heads
        else:
            self.head_size = head_size

        self.cuda_graphs = {}
        self.kv_cache = []
        self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype
        self.bucketing_ctx = None
        self.max_total_tokens = None
        self.max_input_tokens = None
        htorch.core.hpu_set_env()
        if htorch.utils.internal.is_lazy():
            htorch.hpu.wrap_in_hpu_graph(model, disable_tensor_cache=True)
        environment.set_model_config(self.config)
        self.use_contiguous_pa = (
            os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true"
        )
        self.limit_hpu_graph = (
            os.environ.get("LIMIT_HPU_GRAPH", "false").lower() == "true"
        )
        self.skip_warmup = os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true"
        self.max_seq_len_to_capture = 8192
        if tokenizer.pad_token_id is None:
            if config.pad_token_id is not None:
                tokenizer.pad_token_id = config.pad_token_id
            elif config.eos_token_id is not None:
                tokenizer.pad_token_id = (
                    config.eos_token_id[0]
                    if isinstance(config.eos_token_id, list)
                    else config.eos_token_id
                )
            elif tokenizer.eos_token_id is not None:
                tokenizer.pad_token_id = tokenizer.eos_token_id
            else:
                tokenizer.pad_token_id = 0
        super().__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=False,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
            sliding_window=config.sliding_window,
            support_chunking=support_chunking,
        )

    @property
    def batch_type(self) -> Type[FlashCausalLMBatch]:
        return FlashCausalLMBatch

    def max_past(self) -> int:
        return getattr(self.model, "max_past", None)

    def init_kv_cache(
        self,
        num_blocks: int,
        num_layers: int,
        num_heads: int,
        head_size: int,
        dtype: torch.dtype,
        device: torch.device,
    ):
        self.kv_cache = []
        empty_cache()
        if self.config.model_type in ["deepseek_v3", "deepseek_v2"]:
            self.kv_cache = [
                KVCompressCache(
                    num_blocks=num_blocks,
                    head_size=self.config.kv_lora_rank + self.config.qk_rope_head_dim,
                    dtype=dtype,
                    device=device,
                )
                for _ in range(num_layers)
            ]
        else:
            self.kv_cache = [
                KVCache(
                    num_blocks=num_blocks,
                    num_heads=num_heads,
                    head_size=head_size,
                    dtype=dtype,
                    device=device,
                )
                for _ in range(num_layers)
            ]

    def warmup(
        self,
        batch: FlashCausalLMBatch,
        max_input_tokens: Optional[int],
        max_total_tokens: Optional[int],
    ):
        if os.environ.get("MAX_BATCH_SIZE") is None:
            raise RuntimeError(
                "MAX_BATCH_SIZE is not set, it should be set in the launcher "
                "using `--max-batch-size xxx`"
            )
        # The warmup batch is the biggest batch we could ever receive
        self.kv_cache = []
        empty_cache()
        self.graphed_buckets = set()
        # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
        # Calculate the number of blocks that can be allocated with the free memory
        dtype_size = torch.tensor([], dtype=self.kv_cache_dtype).element_size()
        if self.config.model_type in ["deepseek_v3", "deepseek_v2"]:
            cache_block_size = BLOCK_SIZE * (
                self.config.kv_lora_rank + self.config.qk_rope_head_dim
            )
        else:
            cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size
            cache_block_size = cache_block_size * 2
        total_cache_size = self.num_layers * cache_block_size * dtype_size
        free_memory = get_free_memory(self.device, TGI_WIGGLE_ROOM)
        self.mem_reserved = int(free_memory * (1 - MEMORY_FRACTION))
        graph_reserved_mem = (
            float(os.environ.get("TGI_GRAPH_RESERVED_MEM", "0.1"))
            if htorch.utils.internal.is_lazy()
            else 0
        )
        mem_used_from_graph = int(
            (free_memory - self.mem_reserved) * graph_reserved_mem
        )
        log_master(
            logger.info,
            f"Free memory on device {self.device}: {format_bytes(free_memory)} used_for_graph: {format_bytes(mem_used_from_graph)} ratio {graph_reserved_mem} reserved_for_runtime: {format_bytes(self.mem_reserved)}",
        )
        if max_total_tokens is None:
            max_total_tokens = sum(batch.input_lengths)

        if max_input_tokens is None:
            max_input_tokens = max_total_tokens - 1

        self.max_total_tokens = max_total_tokens
        self.max_input_tokens = max_input_tokens
        try:
            self.init_kv_cache(
                batch.num_blocks,
                self.num_layers,
                self.num_kv_heads,
                self.head_size,
                self.kv_cache_dtype,
                self.device,
            )

            batch_num_blocks = batch.num_blocks

            num_tokens = batch.to_pb().current_tokens
            synchronize(self.device)
            _, _batch, _ = self.generate_token([batch])
        except Exception:
            raise RuntimeError(
                f"Not enough memory to handle {num_tokens} prefill tokens. "
                f"You need to decrease `--max-batch-prefill-tokens`"
            )

        synchronize(self.device)
        free_memory = get_free_memory(self.device, TGI_WIGGLE_ROOM)

        kv_memory = free_memory - self.mem_reserved - mem_used_from_graph
        num_blocks = (
            # Leave 5% for some wiggle room
            int(kv_memory // total_cache_size)
            # Add batch.num_blocks as we allocated it above, so it is included in the peak memory.
            + batch_num_blocks
        )

        log_master(logger.info, f"KV-cache blocks: {num_blocks}, size: {BLOCK_SIZE}")

        self.kv_cache = []
        empty_cache()
        self.init_kv_cache(
            num_blocks,
            self.num_layers,
            self.num_kv_heads,
            self.head_size,
            self.kv_cache_dtype,
            self.device,
        )
        self.max_batch_prefill_tokens = get_max_prefill_tokens()
        max_num_seqs = int(os.getenv("MAX_BATCH_SIZE"))
        HPUBucketingContext = get_bucketing_context()
        # need to warmup one more step since block is allocated from 1
        block_step = os.getenv("VLLM_DECODE_BLOCK_BUCKET_STEP", BLOCK_SIZE)
        max_total_tokens_aligned = math.ceil(
            max_total_tokens / BLOCK_SIZE
        ) * BLOCK_SIZE + math.ceil(block_step * BLOCK_SIZE / max_num_seqs)
        model_max_length = self.tokenizer.model_max_length
        max_position_embeddings = getattr(
            self.config, "max_position_embeddings", model_max_length
        )

        self.bucketing_ctx = HPUBucketingContext(
            max_num_seqs,
            max_num_seqs,  # self.max_num_prefill_seqs, #TODO
            BLOCK_SIZE,
            max_num_seqs * max_total_tokens_aligned,
            False,
            min(model_max_length, max_position_embeddings),
            max_input_tokens,
            max_total_tokens_aligned,
        )
        max_blocks = max(
            BLOCK_SIZE, max_num_seqs * max_total_tokens_aligned // BLOCK_SIZE
        )
        self.bucketing_ctx.num_hpu_blocks = min(max_blocks, num_blocks)
        synchronize(self.device)
        if self.skip_warmup:
            self.bucketing_ctx.generate_prompt_buckets()
            self.bucketing_ctx.generate_decode_buckets(
                self.bucketing_ctx.num_hpu_blocks
            )
            log_master(
                logger.info, "skip warmup hpu graph, not recommmended, may cause OOM"
            )
            del _batch, batch
            return int(num_blocks * BLOCK_SIZE), max_input_tokens, max_total_tokens
        self.warmup_hpu_graph(batch)
        del _batch, batch

        return int(num_blocks * BLOCK_SIZE), max_input_tokens, max_total_tokens

    def log_warmup(self, prefilling, i, max_i, batch_size, seq_len):
        free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory())
        phase = "Prompt" if prefilling else "Decode"
        dim = "seq_len" if prefilling else "num_blocks"
        graphed_bucket = (batch_size, seq_len, prefilling)
        bypass = graphed_bucket not in self.graphed_buckets
        msg = (
            f"[Warmup][{phase}][{i+1}/{max_i}] "
            f"batch_size:{batch_size} "
            f"{dim}:{seq_len} "
            f"bypass:{bypass} "
            f"free_mem:{free_mem}"
            ", this may take a while..."
        )
        log_master(logger.info, msg)

    def use_graphs(self, prefill, seq_len, batch_size):
        if self.limit_hpu_graph and prefill:
            return False

        if self.skip_warmup:
            return True

        return (batch_size, seq_len, prefill) in self.graphed_buckets

    def align_workers(self, value, op):
        if self.world_size <= 1:
            return value
        value_t = torch.tensor(value, device="cpu")
        torch.distributed.all_reduce(value_t, op=op, group=self.process_group_cpu)
        return value_t.item()

    def warmup_hpu_graph(self, batch):
        prompt_graph_mem_ratio = float(os.environ.get("VLLM_GRAPH_PROMPT_RATIO", "0.3"))
        free_mem = HabanaMemoryProfiler.current_free_device_memory()
        graph_free_mem = free_mem - self.mem_reserved
        graph_free_mem = self.align_workers(
            graph_free_mem, torch.distributed.ReduceOp.MIN
        )
        prompt_available_memory = prompt_graph_mem_ratio * graph_free_mem
        decode_available_memory = graph_free_mem - prompt_available_memory
        msg = (
            f"Using {format_bytes(graph_free_mem)}"
            f"/{format_bytes(free_mem)} "
            "of free device memory for HPUGraphs, "
            f"{format_bytes(prompt_available_memory)} for prompt and "
            f"{format_bytes(decode_available_memory)} for decode "
            f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
        )
        log_master(logger.info, msg)
        start_time = time.time()
        warmup_shape_count = 0
        warmup_times = 3
        self.bucketing_ctx.generate_prompt_buckets()

        def ordering_function_min_tokens(b):
            return (b[0] * b[1], b[1], b[0])

        buckets = list(
            sorted(self.bucketing_ctx.prompt_buckets, key=ordering_function_min_tokens)
        )
        total_batch_seq = 0.001
        total_mem = 0
        available_mem = prompt_available_memory
        msg = (
            f"Prefill batch size list:{[bsz[0] for bsz in buckets]}\n"
            f"Prefill sequence length list:{[seq[1] for seq in buckets]}\n"
        )
        log_master(logger.info, msg)
        for i, (batch_size, seq_len) in enumerate(buckets):
            if batch_size * seq_len > self.max_batch_prefill_tokens:
                continue
            # Graph memory usage is proportional to seq dimension in a batch
            batch_seq = batch_size * seq_len
            mem_estimate = batch_seq / total_batch_seq * total_mem
            graphed_bucket = (batch_size, seq_len, True)
            if not (
                mem_estimate >= available_mem or batch_seq > self.max_seq_len_to_capture
            ):
                if graphed_bucket not in self.graphed_buckets:
                    self.graphed_buckets.add(graphed_bucket)
            warmup_shape_count += 1
            self.log_warmup(True, i, len(buckets), batch_size, seq_len)
            with HabanaMemoryProfiler() as mem_prof:
                for index in range(warmup_times):
                    self.warmup_prefill(seq_len, batch_size, batch)
                    synchronize(self.device)
            used_mem = self.align_workers(
                mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX
            )
            if graphed_bucket in self.graphed_buckets:
                available_mem -= used_mem
                total_mem += used_mem
                total_batch_seq += batch_seq

        log_master(logger.info, "Prefill warmup successful.\n")

        def ordering_function_max_bs(b):
            return (-b[0], b[1])

        self.bucketing_ctx.generate_decode_buckets(self.bucketing_ctx.num_hpu_blocks)
        buckets = list(
            sorted(self.bucketing_ctx.decode_buckets, key=ordering_function_max_bs)
        )
        free_mem = HabanaMemoryProfiler.current_free_device_memory()
        total_batch_seq = 0.001
        total_mem = 0
        available_mem = free_mem - self.mem_reserved
        log_master(
            logger.info, f"Decode batch size list:{[bsz[0] for bsz in buckets]}\n"
        )
        for i, (batch_size, block_num) in enumerate(buckets):
            if batch_size > block_num:
                continue
            # Graph memory usage is proportional to seq dimension in a batch
            batch_seq = batch_size
            mem_estimate = batch_seq / total_batch_seq * total_mem
            graphed_bucket = (batch_size, block_num, False)
            if not mem_estimate >= available_mem:
                if graphed_bucket not in self.graphed_buckets:
                    self.graphed_buckets.add(graphed_bucket)
            warmup_shape_count += 1
            self.log_warmup(False, i, len(buckets), batch_size, block_num)
            with HabanaMemoryProfiler() as mem_prof:
                for index in range(warmup_times):
                    self.warmup_decode(batch_size, block_num, batch)
                    synchronize(self.device)
            used_mem = self.align_workers(
                mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX
            )
            if graphed_bucket in self.graphed_buckets:
                available_mem -= used_mem
                total_mem += used_mem
                total_batch_seq += batch_seq

        log_master(logger.info, "Decode warmup successful.\n")

        log_master(
            logger.info,
            f"warmup hpu graph time {int(time.time() - start_time)}s warmup shape count {warmup_shape_count}",
        )

    def warmup_prefill(
        self, prompt_len: int, batch_size: int, batch: FlashCausalLMBatch
    ):
        input_ids = torch.zeros(prompt_len, dtype=batch.input_ids.dtype).repeat(
            batch_size
        )
        position_ids = torch.arange(prompt_len, dtype=batch.position_ids.dtype).repeat(
            batch_size
        )
        max_bt = (prompt_len // BLOCK_SIZE + 1) * batch_size
        block_tables = torch.arange(max_bt, dtype=torch.int32).reshape(batch_size, -1)
        slot_acc = []
        for i in range(batch_size):
            slots = []
            for b in block_tables[i]:
                slots.extend(range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE))
            slot_acc.extend(slots[:prompt_len])
        slots = torch.tensor(slot_acc, dtype=batch.slots.dtype)

        input_lengths = torch.ones(batch_size, dtype=torch.int32) * prompt_len
        cu_seqlen_prefill = torch.zeros(batch_size + 1, dtype=torch.int32)
        torch.cumsum(input_lengths, -1, out=cu_seqlen_prefill[1:])

        seqlen = Seqlen(
            input_lengths=_async_h2d_tensor_copy(input_lengths),
        )
        lm_head_indices = input_lengths - 1
        kwargs = {}
        if htorch.utils.internal.is_lazy():
            kwargs["bypass_hpu_graphs"] = not self.use_graphs(
                True, prompt_len, batch_size
            )
        if self.sliding_window is not None:
            attn_mask = seqlen.make_sliding_window_bias(
                input_lengths.tolist(),
                self.sliding_window,
                self.dtype,
                prompt_len,
                batch_size,
            )
            seqlen.attn_mask = _async_h2d_tensor_copy(attn_mask)

        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
        self.model.forward(
            input_ids=_async_h2d_tensor_copy(input_ids),
            position_ids=_async_h2d_tensor_copy(position_ids),
            cu_seqlen_prefill=_async_h2d_tensor_copy(cu_seqlen_prefill),
            kv_cache=self.kv_cache,
            slots=_async_h2d_tensor_copy(slots),
            seqlen=trim_seqlen_metadata(seqlen),
            lm_head_indices=_async_h2d_tensor_copy(lm_head_indices),
            adapter_data=None,
            hpu_attention_meta=None,
            **kwargs,
        )

    def warmup_decode(self, batch_size: int, block_num: int, batch: FlashCausalLMBatch):
        input_ids = torch.zeros(batch_size, dtype=batch.input_ids.dtype)
        position_ids = torch.arange(batch_size, dtype=batch.position_ids.dtype)
        blocks = [block_num // batch_size for _ in range(batch_size)]
        blocks[0] += block_num % batch_size
        block_tables = []
        slots = []
        start_idx = 0
        slot_indices = []

        # fetch the last blocked to warmup block num
        for i in range(batch_size):
            block_array = list(range(start_idx, start_idx + blocks[i]))
            slots.append(BLOCK_SIZE * block_array[-1] + BLOCK_SIZE - 1)
            slot_indices.append((start_idx + blocks[i]) * BLOCK_SIZE - 1)
            block_tables.append(block_array)
            start_idx += blocks[i]
        input_lengths = torch.ones(batch_size, dtype=torch.int32)
        cu_seqlen_prefill = torch.zeros(batch_size + 1, dtype=torch.int32)
        torch.cumsum(input_lengths, -1, out=cu_seqlen_prefill[1:])

        seqlen = Seqlen(
            input_lengths=_async_h2d_tensor_copy(input_lengths),
        )
        block_list, block_groups, block_usage, _, block_bucket_size = (
            generate_block_metadata(
                self.dtype,
                self.use_contiguous_pa,
                slots,
                block_tables,
                self.bucketing_ctx,
            )
        )
        meta = HPUPagedAttentionMetadata(
            block_list=_async_h2d_tensor_copy(block_list),
            block_groups=_async_h2d_tensor_copy(block_groups),
            block_usage=_async_h2d_tensor_copy(block_usage),
            block_mapping=None,
            attn_bias=None,
        )
        if self.sliding_window is not None:
            block_tables_in_window = []
            for i, bt in enumerate(block_tables):
                block_num_in_window = (
                    self.sliding_window + BLOCK_SIZE - 1
                ) // BLOCK_SIZE
                block_tables_in_window.append(
                    bt[max(0, blocks[i] - block_num_in_window) : blocks[i]]
                )
            slots_in_window = []
            start_idx = 0
            for i, indice in enumerate(slot_indices):
                mask = (
                    indice - torch.arange(start_idx, indice + 1)
                ) < self.sliding_window
                slots_in_window.append(torch.arange(start_idx, indice + 1)[mask])
                start_idx += blocks[i] * BLOCK_SIZE
            slots_in_window = torch.cat(slots_in_window, dim=0)
            (
                block_list_in_window,
                block_groups_in_window,
                block_usage_in_window,
                slots_in_window_mask,
                _,
            ) = generate_block_metadata(
                self.dtype,
                self.use_contiguous_pa,
                slots,
                block_tables_in_window,
                self.bucketing_ctx,
                slots_in_window,
                block_bucket_size,
            )
            meta.block_list_in_window = _async_h2d_tensor_copy(block_list_in_window)
            meta.block_groups_in_window = _async_h2d_tensor_copy(block_groups_in_window)
            meta.block_usage_in_window = _async_h2d_tensor_copy(block_usage_in_window)
            meta.slots_in_window_mask = _async_h2d_tensor_copy(slots_in_window_mask)

        hpu_attention_meta = trim_attn_metadata(meta)
        slots_tensor = torch.tensor(slots, dtype=batch.slots.dtype)
        kwargs = {}
        if htorch.utils.internal.is_lazy():
            kwargs["bypass_hpu_graphs"] = not self.use_graphs(
                False, hpu_attention_meta.block_list.shape[0], batch_size
            )
        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
        self.model.forward(
            input_ids=_async_h2d_tensor_copy(input_ids),
            position_ids=_async_h2d_tensor_copy(position_ids),
            cu_seqlen_prefill=None,
            kv_cache=self.kv_cache,
            slots=_async_h2d_tensor_copy(slots_tensor),
            seqlen=trim_seqlen_metadata(seqlen),
            lm_head_indices=None,
            adapter_data=None,
            hpu_attention_meta=hpu_attention_meta,
            **kwargs,
        )

    def forward(
        self, batch: FlashCausalLMBatch, adapter_data: AdapterBatchData
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Model Forward
        if batch.speculative_ids is not None:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

            speculative_ids = batch.speculative_ids

            B, speculative_length = speculative_ids.shape
            new_length = speculative_length + 1
            new_input_ids = torch.cat(
                [input_ids.unsqueeze(-1), speculative_ids], dim=1
            ).reshape(-1)
            arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0)
            arange_int = arange.to(dtype=torch.int32)
            new_position_ids = (
                position_ids.unsqueeze(-1).expand(B, new_length) + arange
            ).view(-1)

            # Slots can be discontiguous when prefix caching is enabled, so we need to expand the slot_indices,
            # then update the slots with the additional indices to ensure we're grabbing the ones that have been
            # allocated
            slot_indices = (
                batch.slot_indices.unsqueeze(-1).expand(B, new_length) + arange_int
            ).view(-1)
            slots = batch.slots[slot_indices]

            input_lengths = (
                input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
            ).view(-1)

            # Add Copy the block tables for all members
            block_tables = (
                block_tables.unsqueeze(1)
                .expand(B, new_length, -1)
                .reshape(B * new_length, -1)
                .contiguous()
            )
            max_s = max_s + speculative_length

            input_ids = new_input_ids
            position_ids = new_position_ids
        else:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

        if cu_seqlen_prefill is None and self.max_past() is not None:
            # In decode, not prefill, we're actually overwriting the KV-cache
            # in a circular buffer mode.
            # This makes sure the max_s for the decode pass is correct.
            max_s = min(self.max_past(), max_s)
        if batch.prefill_cache_indices is not None:
            slots_pad = torch.zeros_like(input_ids, device=slots.device)
            slots_pad[batch.prefill_cache_indices] = slots
            slots = slots_pad
        else:
            slots_pad = torch.zeros_like(input_ids, device=slots.device)
            slots_pad[: slots.shape[0]] = slots
            slots = slots_pad
        seqlen = Seqlen(
            input_lengths=_async_h2d_tensor_copy(input_lengths),
        )

        kwargs = {}
        batch_size = input_lengths.shape[0]
        prompt_len = (
            input_ids.shape[0] // batch_size
            if batch.prefilling
            else batch.hpu_attn_meta.block_list.shape[0]
        )
        if htorch.utils.internal.is_lazy():
            kwargs["bypass_hpu_graphs"] = not self.use_graphs(
                batch.prefilling, prompt_len, batch_size
            )
        if self.sliding_window is not None and batch.prefilling:
            attn_mask = seqlen.make_sliding_window_bias(
                input_lengths.tolist(),
                self.sliding_window,
                self.dtype,
                prompt_len,
                batch_size,
            )
            seqlen.attn_mask = _async_h2d_tensor_copy(attn_mask)

        logits, speculative_logits = self.model.forward(
            input_ids=input_ids,
            position_ids=_async_h2d_tensor_copy(position_ids),
            cu_seqlen_prefill=_async_h2d_tensor_copy(cu_seqlen_prefill),
            kv_cache=kv_cache,
            slots=_async_h2d_tensor_copy(slots),
            seqlen=trim_seqlen_metadata(seqlen),
            lm_head_indices=_async_h2d_tensor_copy(lm_head_indices),
            # TODO not support adapter now, need the add in the future
            adapter_data=None,
            hpu_attention_meta=batch.hpu_attn_meta,
            **kwargs,
        )
        return logits, speculative_logits

    @tracer.start_as_current_span("generate_token")
    def generate_token(
        self, batches: List[FlashCausalLMBatch]
    ) -> Tuple[List[Generation], Optional[FlashCausalLMBatch], Tuple[int, int]]:

        # In order to pipeline any actions on CPU we perform the operation in 3 main stages:
        # Stage 1. Collect next token ids of any previously started generations
        start = time.time_ns()
        prev_batches = []
        requests_to_generate = []
        for batch_id, batch in enumerate(batches):
            if batch.next_token_logits is not None:
                prefill = batch.prefilling
                if batch.prefilling:
                    batch.prefilling = False
                    batch.prefilling_mask = [False] * len(batch)

                speculate = get_speculate()
                (
                    next_input_ids,
                    next_token_logprobs,
                    logprobs,
                    accepted_ids,
                    speculative_ids,
                ) = batch.next_token_chooser(
                    batch.all_input_ids_tensor[
                        : batch.next_token_logits.shape[0], : batch.max_current_length
                    ],
                    batch.next_token_logits,
                    speculate,
                    batch.speculative_ids,
                    batch.speculative_logits,
                )

                batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
                    batch.top_n_tokens,
                    _async_h2d_tensor_copy(batch.top_n_tokens_tensor),
                    logprobs,
                    accepted_ids,
                )
                if batch.valid_indices is not None:
                    # TODO speculative decoding handling missing
                    index = torch.arange(
                        0,
                        len(batch.valid_indices),
                        device=batch.all_input_ids_tensor.device,
                    )
                    batch.all_input_ids_tensor.index_copy_(
                        0, index, batch.all_input_ids_tensor[batch.valid_indices]
                    )
                    padded_total_bs = self.bucketing_ctx.get_padded_decode_batch_size(
                        len(batch.valid_indices)
                    )
                    next_input_ids.index_copy_(
                        0, index, next_input_ids[batch.valid_indices]
                    )
                    next_input_ids = next_input_ids[:padded_total_bs]

                    next_token_logprobs.index_copy_(
                        0, index, next_token_logprobs[batch.valid_indices]
                    )
                    accepted_ids.index_copy_(
                        0, index, accepted_ids[batch.valid_indices]
                    )
                    if speculative_ids is not None:
                        speculative_ids = speculative_ids[batch.valid_indices]
                    batch.top_n_tokens_tensor = batch.top_n_tokens_tensor[
                        batch.valid_indices
                    ]
                    top_n_tokens = []
                    batch_top_token_ids_v = []
                    batch_top_token_logprobs_v = []
                    for i in batch.valid_indices:
                        top_n_tokens.append(batch.top_n_tokens[i])
                        batch_top_token_ids_v.append(batch_top_token_ids[i])
                        batch_top_token_logprobs_v.append(batch_top_token_logprobs[i])
                    batch_top_token_ids = batch_top_token_ids_v
                    batch_top_token_logprobs = batch_top_token_logprobs_v
                    batch.top_n_tokens = top_n_tokens
                    batch.next_token_chooser = batch.next_token_chooser.filter(
                        batch.valid_indices
                    )
                    batch.valid_indices = None

                # Since we are done prefilling, all the tensors that were concatenating values for all the requests
                # instantly become of shape [BATCH_SIZE]
                if prefill:
                    indices = batch.cu_seqlen_prefill[1:] - 1
                    # pad in left
                    if batch.prefill_cache_indices is not None:
                        batch.position_ids = batch.position_ids[
                            batch.prefill_cache_indices
                        ][indices]
                    else:
                        batch.position_ids = batch.position_ids[indices]

                    batch.slot_indices = batch.slot_indices[indices[: len(batch)]]
                    if batch.adapter_meta is not None:
                        batch.adapter_meta.adapter_indices = (
                            batch.adapter_meta.adapter_indices[indices]
                        )
                # For each member of the batch
                # Cumulative length

                if batch.speculative_logits is not None:
                    cu_accepted_ids = accepted_ids.new_zeros(accepted_ids.shape[0] + 1)
                    torch.cumsum(accepted_ids, dim=0, out=cu_accepted_ids[1:])
                    for i in range(len(batch)):
                        batch.all_input_ids_tensor[
                            i,
                            batch.cache_lengths[i]
                            + batch.input_lengths[i] : batch.cache_lengths[i]
                            + batch.input_lengths[i]
                            + accepted_ids[i],
                        ] = next_input_ids[cu_accepted_ids[i] : cu_accepted_ids[i + 1]]
                    batch.input_ids = next_input_ids[cu_accepted_ids[1:] - 1]
                    accepted_ids = accepted_ids.cpu()
                    if batch.position_ids.dim() == 2:
                        # Qwen2_vl case:
                        batch.position_ids += accepted_ids.unsqueeze(-1)
                    else:
                        batch.position_ids += accepted_ids
                    batch.cache_lengths_tensor += (
                        batch.input_lengths_tensor + accepted_ids - 1
                    )
                    batch.input_lengths_tensor = torch.ones_like(
                        batch.input_lengths_tensor
                    )
                    batch.slot_indices += accepted_ids[: len(batch)]
                else:
                    index = batch.cache_lengths_tensor + batch.input_lengths_tensor
                    index = F.pad(
                        index, (0, next_input_ids.shape[0] - index.shape[0]), value=0
                    )
                    index = index.to(batch.all_input_ids_tensor.device)
                    batch_idx = torch.arange(
                        0,
                        index.shape[0],
                        dtype=torch.long,
                        device=batch.all_input_ids_tensor.device,
                    )
                    batch.all_input_ids_tensor.index_put_(
                        (batch_idx, index.long()), next_input_ids
                    )
                    batch.input_ids = next_input_ids
                    batch.position_ids += 1
                    batch.cache_lengths_tensor += batch.input_lengths_tensor
                    batch.input_lengths_tensor = torch.ones_like(
                        batch.input_lengths_tensor
                    )
                    batch.slot_indices += 1

                batch.speculative_ids = speculative_ids

                # Does a HPU <-> CPU sync internally
                if prefill and batch.adapter_meta is not None:
                    # adjust segment lengths to account for all request lengths being 1 during decoding
                    adapter_segments, _ = find_segments(
                        batch.adapter_meta.adapter_indices
                    )
                    batch.adapter_meta.adapter_segments = torch.tensor(
                        adapter_segments,
                        dtype=torch.int32,
                        device=batch.adapter_meta.adapter_segments.device,
                    )
                prev_batches.append(
                    {
                        "next_token_ids": next_input_ids,
                        "next_token_logprobs": next_token_logprobs,
                        "accepted_ids": accepted_ids,
                    }
                )
                idx = len(prev_batches) - 1

                for req_idx, req in enumerate(batch.requests):
                    new_input_length = 1
                    if batch.speculative_logits is not None:
                        new_cache_length = (
                            batch.cache_lengths[req_idx]
                            + batch.input_lengths[req_idx]
                            + accepted_ids[req_idx]
                            - 1
                        )
                    else:
                        new_cache_length = (
                            batch.cache_lengths[req_idx] + batch.input_lengths[req_idx]
                        )
                    batch.cache_lengths[req_idx] = new_cache_length
                    batch.max_input_length = max(
                        batch.max_input_length, new_input_length
                    )
                    batch.input_lengths[req_idx] = new_input_length
                    current_length = new_cache_length + new_input_length
                    batch.max_current_length = max(
                        batch.max_current_length, current_length
                    )

                    requests_to_generate.append(
                        {
                            "idx": idx,
                            "request_id": req.id,
                            "prefix_offset": batch.prefix_offsets[req_idx],
                            "read_offset": batch.read_offsets[req_idx],
                            "stopping_criteria": batch.stopping_criterias[req_idx],
                            "all_input_ids": batch.all_input_ids[req_idx],
                            "do_sample": batch.next_token_chooser.do_sample[req_idx],
                            "seed": batch.next_token_chooser.seeds[req_idx],
                            "top_n_tokens": batch.top_n_tokens[req_idx],
                            "top_token_ids": batch_top_token_ids[req_idx],
                            "top_token_logprobs": batch_top_token_logprobs[req_idx],
                        }
                    )
                if prefill:
                    # We do not need prefill tensors anymore
                    batch.cu_seqlen_prefill = None
                    batch.prefill_cache_indices = None
                    batch.prefill_cu_outlens = None
                    batch.prefill_head_indices = None
                    batch.prefill_next_token_indices = None
                batch.next_token_logits = None
                batch.speculative_ids = None

        htorch.core.mark_step()
        # Stage 2. Prepare new batch for speculative scheduling
        if len(batches) > 1:
            if self.bucketing_ctx is not None:
                total_batch_size = 0
                for b in batches:
                    total_batch_size += len(b)
                padded_total_bs = self.bucketing_ctx.get_padded_decode_batch_size(
                    total_batch_size
                )
                batch = self.batch_type.concatenate(
                    batches, padded_total_bs=padded_total_bs
                )
            else:
                batch = self.batch_type.concatenate(batches)
        else:
            batch = batches[0]
        prefill = batch.prefilling
        if prefill:
            if self.bucketing_ctx is not None:
                batch.prepare_for_prefill(
                    self.bucketing_ctx.get_padded_prompt_seq_len(
                        batch.max_input_length
                    ),
                    self.bucketing_ctx.get_padded_prompt_batch_size(len(batch)),
                    self.max_total_tokens,
                    self.tokenizer.pad_token_id,
                )
            else:
                batch.prepare_for_prefill(
                    batch.max_input_length,
                    len(batch),
                    self.max_total_tokens,
                    self.tokenizer.pad_token_id,
                )
        else:
            batch.prepare_for_decode(
                self.dtype,
                self.use_contiguous_pa,
                self.bucketing_ctx,
                self.tokenizer.pad_token_id,
                self.sliding_window,
            )
        if hasattr(self, "set_inputs_embeds") and callable(self.set_inputs_embeds):
            self.set_inputs_embeds(batch)
        prefill_logprobs = batch.prefill_next_token_indices is not None
        # Update adapter indices for speculative tokens (if present)
        adapter_meta = batch.adapter_meta
        if adapter_meta is not None:
            if batch.speculative_ids is not None:
                B, speculative_length = batch.speculative_ids.shape
                new_length = speculative_length + 1
                adapter_indices = (
                    adapter_meta.adapter_indices.unsqueeze(-1)
                    .expand(B, new_length)
                    .reshape(-1)
                )
                adapter_segments = adapter_meta.adapter_segments * new_length
                adapter_meta = AdapterBatchMetadata(
                    adapter_indices=adapter_indices,
                    adapter_set=adapter_meta.adapter_set,
                    adapter_segments=adapter_segments,
                    segment_indices=adapter_meta.segment_indices,
                )

            # Assign pointers to adapter weights
            # TODO(travis): don't update this if indices haven't changed
            adapter_data = AdapterBatchData.from_meta(
                adapter_meta,
                self.layer_to_adapter_weights,
                prefill,
                batch.prefill_head_indices,
            )
        else:
            adapter_data = None

        out, speculative_logits = self.forward(batch, adapter_data)

        if prefill:
            batch.next_token_logits = (
                out[batch.prefill_next_token_indices] if prefill_logprobs else out
            )
            if speculative_logits is not None:
                speculative_logits = (
                    speculative_logits[batch.prefill_next_token_indices]
                    if prefill_logprobs
                    else speculative_logits
                )
        else:
            prefill_logprobs = None
            batch.next_token_logits = out
        batch.speculative_logits = speculative_logits

        # HPU->CPU sync
        htorch.core.mark_step()
        start_decode = time.time_ns()
        for prev_batch in prev_batches:
            prev_batch["next_token_logprobs"] = prev_batch[
                "next_token_logprobs"
            ].tolist()
            prev_batch["next_token_ids"] = prev_batch["next_token_ids"].tolist()
            prev_batch["accepted_ids"] = prev_batch["accepted_ids"].tolist()
        htorch.core.mark_step()
        # Stage 3. Finish and return previous generations
        # Results
        generations: List[Generation] = []
        stopped = len(requests_to_generate) > 0
        # Reset max_input_length
        batch.max_input_length = 0
        # For each member of the batch
        indexs = [0] * len(prev_batches)
        idx_accept_ids = [0] * len(prev_batches)
        for i, req_data in enumerate(requests_to_generate):
            idx = req_data["idx"]
            request_id = req_data["request_id"]
            prefix_offset = req_data["prefix_offset"]
            read_offset = req_data["read_offset"]
            stopping_criteria = req_data["stopping_criteria"]
            all_input_ids = req_data["all_input_ids"]
            do_sample = req_data["do_sample"]
            seed = req_data["seed"]
            top_n_tokens = req_data["top_n_tokens"]
            n_accepted_ids = prev_batches[idx]["accepted_ids"][idx_accept_ids[idx]]
            top_token_ids = req_data["top_token_ids"]
            top_token_logprobs = req_data["top_token_logprobs"]
            # Append next token to all tokens
            next_token_texts = []
            left = 0

            if n_accepted_ids > 1:
                log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}")

            current_stopped = False
            index = indexs[idx]
            for j in range(index, index + n_accepted_ids):
                # Generated token
                next_token_id = prev_batches[idx]["next_token_ids"][j]
                all_input_ids.append(next_token_id)
                next_token_text, prefix_offset, read_offset = self.decode_token(
                    all_input_ids,
                    prefix_offset,
                    read_offset,
                )
                next_token_texts.append(next_token_text)

                stop, reason = stopping_criteria(
                    next_token_id,
                    next_token_text,
                )

                if stop:
                    left = index + n_accepted_ids - j - 1
                    current_stopped = True
                    break
                else:
                    current_stopped = False
            stopped = stopped and current_stopped

            _next_token_ids = prev_batches[idx]["next_token_ids"][
                index : index + n_accepted_ids - left
            ]
            _next_token_logprobs = prev_batches[idx]["next_token_logprobs"][
                index : index + n_accepted_ids - left
            ]

            # Shard generations
            # All generations will be appended in the rust sharded client
            if request_id % self.world_size == self.rank:
                if stop:
                    # Decode generated tokens
                    output_text, _, _ = self.decode_token(
                        all_input_ids,
                        prefix_offset=len(all_input_ids)
                        - stopping_criteria.current_tokens
                        - 1,
                        read_offset=len(all_input_ids)
                        - stopping_criteria.current_tokens,
                        skip_special_tokens=True,
                    )
                    generated_text = GeneratedText(
                        output_text,
                        stopping_criteria.current_tokens,
                        reason,
                        seed if do_sample else None,
                    )
                else:
                    generated_text = None

                if top_n_tokens > 0:
                    all_top_tokens = []
                    for top_token_ids, top_token_logprobs in zip(
                        top_token_ids, top_token_logprobs
                    ):
                        toptoken_texts = self.tokenizer.batch_decode(
                            top_token_ids,
                            clean_up_tokenization_spaces=False,
                            skip_special_tokens=False,
                        )
                        special_toptokens = [
                            token_id in self.all_special_ids
                            for token_id in top_token_ids
                        ]
                        top_tokens = Tokens(
                            top_token_ids,
                            top_token_logprobs,
                            toptoken_texts,
                            special_toptokens,
                        )
                        all_top_tokens.append(top_tokens)
                    top_tokens = all_top_tokens
                else:
                    top_tokens = None

                generation = Generation(
                    request_id,
                    None,
                    Tokens(
                        _next_token_ids,
                        _next_token_logprobs,
                        next_token_texts,
                        [nid in self.all_special_ids for nid in _next_token_ids],
                    ),
                    generated_text,
                    top_tokens,
                )

                generations.append(generation)

                # accept each new token for this specific request since we may
                # have more than one new token per request with speculative decoding
                for next_token_id in _next_token_ids:
                    batch.next_token_chooser = (
                        batch.next_token_chooser.advance_grammar_single(
                            i, next_token_id
                        )
                    )

            # Update values
            indexs[idx] += n_accepted_ids
            idx_accept_ids[idx] += 1

            batch.prefix_offsets[i] = prefix_offset
            batch.read_offsets[i] = read_offset
            batch.all_input_ids[i] = all_input_ids
        htorch.core.mark_step()
        if stopped:
            # No need to return a batch if we know that all requests stopped
            forward_ns = start_decode - start
            decode_ns = time.time_ns() - start_decode
            return generations, None, (forward_ns, decode_ns)

        forward_ns = start_decode - start
        decode_ns = time.time_ns() - start_decode
        return generations, batch, (forward_ns, decode_ns)


================================================
FILE: backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py
================================================
import torch
from PIL import Image
from io import BytesIO
from dataclasses import dataclass
from opentelemetry import trace
from typing import Iterable, Optional, Tuple, List, Type, Dict

from transformers import PreTrainedTokenizerBase
from transformers.image_processing_utils import select_best_resolution
from text_generation_server.pb import generate_pb2
from text_generation_server.models.flash_causal_lm import (
    FlashCausalLMBatch,
    FlashCausalLM,
    generate_block_metadata,
)
from text_generation_server.models.globals import PREFIX_CACHING, BLOCK_SIZE
from loguru import logger
from text_generation_server.utils.log import log_master
from transformers import AutoProcessor
from text_generation_server.layers.attention import (
    Seqlen,
    trim_seqlen_metadata,
    _async_h2d_tensor_copy,
    HPUPagedAttentionMetadata,
    trim_attn_metadata,
)
import habana_frameworks.torch as htorch
import time
from text_generation_server.utils.import_utils import (
    synchronize,
)
from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes

tracer = trace.get_tracer(__name__)

IDEFICS2_FAKE_TOKEN = "<fake_token_around_image>"
IDEFICS2_IMAGE_TOKEN = "<image>"

IDEFICS3_IMAGE_TOKEN = "<image>"
IDEFICS3_FAKE_IMAGE_TOKEN = "<fake_token_around_image>"
IDEFICS3_GLOBAL_IMG_TOKEN = "<global-img>"


def prompt_split_image_llama4(aspect_ratio, num_patches_per_chunk):
    """
    Create a structured string representation of image tokens

    Args:
       num_patches: Number of patches in the image

    Returns:
        String with appropriate image tokens
    """
    img_string = "<|image_start|>"
    ratio_h, ratio_w = aspect_ratio
    if ratio_h * ratio_w > 1:
        for yy in range(ratio_h):
            for xx in range(ratio_w):
                img_string += "<|patch|>" * num_patches_per_chunk
                if xx < ratio_w - 1:
                    img_string += "<|tile_x_separator|>"

            img_string += "<|tile_y_separator|>"
    img_string += "<|image|>"
    img_string += "<|patch|>" * num_patches_per_chunk
    img_string += "<|image_end|>"

    return img_string


# copied from: https://github.com/huggingface/transformers/blob/02ed609285c2448b3b54c31e362f2c389fa952ab/src/transformers/models/idefics3/processing_idefics3.py#L44-L60
def _prompt_split_image(
    *,
    image_seq_len: int,
    image_rows: int,
    image_cols: int,
    fake_token_around_image: str,
    image_token: str,
    global_img_token: str,
):
    """Prompt with expanded image tokens for when the image is split into patches."""
    text_split_images = ""
    for n_h in range(image_rows):
        for n_w in range(image_cols):
            text_split_images += (
                f"{fake_token_around_image}"
                + f"<row_{n_h + 1}_col_{n_w + 1}>"
                + f"{image_token}" * image_seq_len
            )
        text_split_images += "\n"

    text_split_images += (
        f"\n{fake_token_around_image}"
        + f"{global_img_token}"
        + f"{image_token}" * image_seq_len
        + f"{fake_token_around_image}"
    )
    return text_split_images


def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (`tuple`):
            The size of the input image in the format (height, width).
        grid_pinpoints (`List`):
            A list containing possible resolutions. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        patch_size (`int`):
            The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if not isinstance(grid_pinpoints, list):
        raise ValueError("grid_pinpoints should be a list of tuples or lists")

    height, width = select_best_resolution(image_size, grid_pinpoints)
    return height // patch_size, width // patch_size


def image_text_replacement(processor, image_input, config) -> str:
    if config.model_type == "idefics2":
        image_seq_len = 64
        image_str = f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_IMAGE_TOKEN * image_seq_len}{IDEFICS2_FAKE_TOKEN}"
        if processor.image_processor.do_image_splitting:
            image_str *= 5
        return image_str, IDEFICS2_FAKE_TOKEN
    if config.model_type == "idefics3":
        # TODO: implement this in a more general way
        n_rows = image_input["rows"][0][0]
        n_cols = image_input["cols"][0][0]
        image_seq_len = int(
            ((config.vision_config.image_size // config.vision_config.patch_size) ** 2)
            / (config.scale_factor**2)
        )
        image_str = _prompt_split_image(
            image_seq_len=image_seq_len,
            image_rows=n_rows,
            image_cols=n_cols,
            fake_token_around_image=IDEFICS3_FAKE_IMAGE_TOKEN,
            image_token=IDEFICS3_IMAGE_TOKEN,
            global_img_token=IDEFICS3_GLOBAL_IMG_TOKEN,
        )
        return image_str, IDEFICS3_FAKE_IMAGE_TOKEN
    elif config.model_type == "llava_next":
        height, width = image_input["image_sizes"][0]
        num_features = get_number_of_features(height, width, config)

        log_master(
            logger.info,
            f"Found {num_features} features in image of resolution {height}x{width}",
        )
        return "<image>" * num_features, "<image>"

    elif config.model_type == "paligemma":
        return "<image>" * config.text_config.num_image_tokens, "<image>"
    elif config.model_type == "qwen2_vl":
        grid_t, grid_h, grid_w = image_input["image_grid_thw"][0]
        num_pads = grid_t * grid_h * grid_w // 4
        padding = "<|image_pad|>" * num_pads
        return f"<|vision_start|>{padding}<|vision_end|>", "<|vision_start|>"
    elif config.model_type == "qwen2_5_vl":
        grid_t, grid_h, grid_w = image_input["image_grid_thw"][0]
        num_pads = grid_t * grid_h * grid_w // 4
        padding = "<|image_pad|>" * num_pads
        return f"<|vision_start|>{padding}<|vision_end|>", "<|vision_start|>"
    elif config.model_type == "gemma3":
        # TODO: get correct number of features via reviewing the Gemma3 architecture
        # and calculating the number of image tokens
        num_pads = 256
        padding = "<image_soft_token>" * num_pads
        return f"\n\n<start_of_image>{padding}<end_of_image>\n\n", "<start_of_image>"
    elif config.model_type == "llama4":
        patch_size = config.vision_config.patch_size
        pixel_shuffle_ratio = config.vision_config.pixel_shuffle_ratio
        downsample_ratio = int(round(1.0 / (pixel_shuffle_ratio**2)))
        aspect_ratios = image_input["aspect_ratios"][0]
        image_height, image_width = image_input["pixel_values"][0].shape[-2:]

        num_patches_per_chunk = int(
            (image_height // patch_size)
            * (image_width // patch_size)
            // downsample_ratio
        )
        tokens_for_this_image = prompt_split_image_llama4(
            aspect_ratios, num_patches_per_chunk
        )

        return tokens_for_this_image, "<|image_start|>"
    else:
        raise RuntimeError(f"Unknown config {config.model_type} for multimodal")


def image_text_replacement_fixup(config, text: str) -> str:
    if config.model_type == "idefics2":
        return text.replace(
            f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_FAKE_TOKEN}", IDEFICS2_FAKE_TOKEN
        )
    return text


def preprocess_text(config, text: str) -> str:
    if config.model_type == "paligemma":
        return "<bos>" + text + "\n"
    return text


def preprocess_image(config, img):
    model_type = config.model_type

    if model_type in {"qwen2_vl", "qwen2_5_vl"} and img.width <= 20:
        img = img.resize((img.width * 2, img.height * 2))
    if model_type == "paligemma":
        img = img.convert("RGB")

    if model_type not in {"llava_next", "gemma3", "llama4"}:
        # TODO: check if this is needed
        img = [img]

    return img


def get_unpadded_features(
    original_height: int,
    original_width: int,
    npatches: int,
    num_patch_height: int,
    num_patch_width: int,
) -> Tuple[int, int]:
    current_height = npatches * num_patch_height
    current_width = npatches * num_patch_width

    aspect_ratio: float = original_width / original_height
    current_aspect_ratio: float = current_width / current_height

    if aspect_ratio > current_aspect_ratio:
        new_height = (original_height * current_width) // original_width
        padding = (current_height - new_height) // 2
        current_height = current_height - (2 * padding)
    else:
        new_width = (original_width * current_height) // original_height
        padding = (current_width - new_width) // 2
        current_width = current_width - (2 * padding)

    unpadded_features = current_height * current_width
    newline_features = current_height
    return (unpadded_features, newline_features)


def get_number_of_features(height: int, width: int, config) -> int:
    # From config
    # Hardcoded for CLIP for now
    # image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
    image_grid_pinpoints = config.image_grid_pinpoints
    image_size = config.vision_config.image_size
    patch_size = config.vision_config.patch_size

    assert image_size % patch_size == 0

    npatches = image_size // patch_size

    # Dimensions are intentionally swapped to be bug-compatible with
    # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
    num_patch_width, num_patch_height = get_anyres_image_grid_shape(
        [height, width],
        image_grid_pinpoints,
        image_size,
    )
    unpadded_features, newline_features = get_unpadded_features(
        height, width, npatches, num_patch_height, num_patch_width
    )
    # The base patch covers the entire image
    base_features = npatches**2
    return unpadded_features + newline_features + base_features


def scatter_image_embeds(
    embeds: torch.Tensor, is_embed: Optional[torch.Tensor]
) -> torch.Tensor:
    if is_embed is None:
        return embeds

    placeholders = embeds.new_full(
        (is_embed.shape[0], embeds.shape[-1]),
        fill_value=torch.nan,
    )
    placeholders[is_embed.to(embeds.device)] = embeds
    return placeholders


def gather_image_embeds(
    embeds: torch.Tensor, is_embed: Optional[torch.Tensor]
) -> Optional[torch.Tensor]:
    if is_embed is None:
        return embeds
    sel = embeds[is_embed.to(embeds.device)]
    return sel if sel.numel() else None


@dataclass
class ImagePositions:
    offset: int
    length: int
    id: int
    num_placeholder_tokens: int
    is_embed: Optional[torch.Tensor] = None


class FlashVlmCausalLMBatch(FlashCausalLMBatch):
    image_inputs: Optional[List[List[Dict[str, torch.Tensor]]]]
    image_positions: Optional[List[List[ImagePositions]]]
    encoder_cache: Optional[List[Dict[int, torch.Tensor]]]
    pixel_values: Optional[List[torch.Tensor]]
    pixel_attention_mask: Optional[List[torch.Tensor]]
    image_sizes: Optional[List[Tuple[int, int]]]
    image_grid_thw: Optional[torch.Tensor]
    cache_entries_to_free: List[Tuple[int, int]]
    has_image_inputs: bool = False
    inputs_embeds: Optional[torch.Tensor] = None

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(cls, batches, padded_total_bs: int = 0):
        batch = super(FlashVlmCausalLMBatch, cls).concatenate(batches, padded_total_bs)
        batch.image_inputs = []
        batch.image_positions = []
        batch.encoder_cache = []
        for b in batches:
            if b.image_inputs is not None:
                batch.image_inputs.extend(b.image_inputs)
            else:
                batch.image_inputs.append(None)
            if b.image_positions is not None:
                batch.image_positions.extend(b.image_positions)
            else:
                batch.image_positions.append(None)
            if b.encoder_cache is not None:
                batch.encoder_cache.extend(b.encoder_cache)
            else:
                batch.encoder_cache.append(None)

        batch.pixel_values = None
        batch.pixel_attention_mask = None
        batch.image_sizes = None
        batch.image_grid_thw = None
        batch.inputs_embeds = None
        # To be filled in prepare_for_prefill
        batch.has_image_inputs = False
        batch.cache_entries_to_free = []
        return batch

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]):
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")

        image_inputs = []
        image_positions = []
        encoder_cache = []

        for request_id in request_ids:
            idx = self.requests_idx_mapping[request_id]
            image_inputs.append(self.image_inputs[idx])
            image_positions.append(self.image_positions[idx])
            encoder_cache.append(self.encoder_cache[idx])

        batch = super().filter(request_ids)
        batch.pixel_values = None
        batch.pixel_attention_mask = None
        batch.image_sizes = None
        batch.image_grid_thw = None
        batch.inputs_embeds = None
        batch.image_inputs = image_inputs
        batch.image_positions = image_positions
        batch.encoder_cache = encoder_cache

        # To be filled in prepare_for_prefill
        batch.has_image_inputs = False
        batch.cache_entries_to_free = []
        return batch

    @classmethod
    def batch_tokenized_inputs(
        cls, requests: Iterable[generate_pb2.Request], tokenizer, processor, config
    ):
        kwargs = {}
        if (
            hasattr(processor, "image_processor_class")
            and processor.image_processor_class == "Idefics3ImageProcessor"
        ):
            kwargs["return_row_col_info"] = True

        max_length = 0
        vocab = tokenizer.get_vocab()

        if not hasattr(config, "image_token_index"):
            config.image_token_index = config.image_token_id

        batch_tokenized_inputs: List[List[int]] = []
        batch_image_inputs: List[Optional[List[dict]]] = []
        batch_image_positions: List[Optional[List[ImagePositions]]] = []

        for r in requests:
            text_parts = []
            image_inputs = []
            image_texts = []

            image_id = 0

            for chunk in r.input_chunks.chunks:
                chunk_type = chunk.WhichOneof("chunk")
                if chunk_type == "text":
                    text = preprocess_text(config, chunk.text)
                    text_parts.append(text)
                elif chunk_type == "image":
                    img = Image.open(BytesIO(chunk.image.data))
                    img = preprocess_image(config, img)

                    image_input = processor.image_processor(
                        [img], return_tensors="pt", **kwargs
                    )
                    image_inputs.append(image_input)

                    img_text, img_start_token_str = image_text_replacement(
                        processor, image_input, config
                    )
                    text_parts.append(img_text)

                    image_texts.append([image_id, img_start_token_str, img_text])
                    image_id += 1
                else:
                    raise RuntimeError(f"Invalid chunk type {chunk_type}")

            full_text = image_text_replacement_fixup(config, "".join(text_parts))
            input_ids = tokenizer(
                full_text,
                truncation=True,
                max_length=r.truncate,
                add_special_tokens=(
                    r.add_special_tokens if config.model_type != "paligemma" else False
                ),
            )["input_ids"]
            max_length = max(max_length, len(input_ids))

            if len(image_inputs) > 0:
                img_start_token = vocab[image_texts[0][1]]
                image_positions = cls.get_image_positions(
                    input_ids, image_texts, img_start_token, config, tokenizer
                )
            else:
                image_inputs = None
                image_positions = None

            batch_tokenized_inputs.append(input_ids)
            batch_image_inputs.append(image_inputs)
            batch_image_positions.append(image_positions)

        return batch_tokenized_inputs, batch_image_inputs, batch_image_positions

    @classmethod
    def get_image_positions(
        cls,
        input_ids: List[int],
        image_texts: List[Tuple[int, str, str]],
        img_start_token: int,
        config,
        tokenizer: PreTrainedTokenizerBase,
    ) -> List[ImagePositions]:
        image_positions = []
        num_images = len(image_texts)

        input_ids_t = torch.as_tensor(input_ids)
        img_start_token_pos = torch.where(input_ids_t.eq(img_start_token))[0]
        num_tokens = input_ids_t.numel()

        last_pos = 0
        for i in range(num_images):
            image_id, img_start_token_str, img_text = image_texts[i]
            img_text = image_text_replacement_fixup(config, img_text)

            if config.model_type == "gemma3":
                img_text = img_text.replace("\n\n", "")

            tokens = tokenizer(img_text, add_special_tokens=False, return_tensors="pt")[
                "input_ids"
            ][0]
            length = tokens.numel()

            assert (
                length <= num_tokens
            ), f"{length} > {num_tokens} Image is truncated, try increasing --max-batch-prefill-tokens"

            pos = torch.searchsorted(img_start_token_pos, last_pos, right=False)
            index = img_start_token_pos[pos]
            assert torch.equal(
                input_ids_t[index : index + length], tokens
            ), "Image tokens not found in input_ids"

            is_embed = tokens == config.image_token_index
            num_placeholder_tokens = int(is_embed.sum())
            if num_placeholder_tokens == length:
                is_embed = None

            pos = ImagePositions(
                offset=index,
                length=length,
                id=image_id,
                num_placeholder_tokens=num_placeholder_tokens,
                is_embed=is_embed,
            )

            image_positions.append(pos)
            last_pos = index + length

            if (
                config.model_type == "idefics2"
                and i + 1 != num_images
                and input_ids[last_pos] == config.image_token_index
            ):
                fake_token = last_pos - 1
                fake_token_index = torch.searchsorted(
                    img_start_token_pos, fake_token, right=False
                )
                img_start_token_pos[fake_token_index] = last_pos
                image_texts[i + 1][2] = image_texts[i + 1][2][
                    len(img_start_token_str) :
                ]

        return image_positions

    @classmethod
    def from_pb_processor(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        processor,
        config,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "FlashVlmCausalLMBatch":
        batch_tokenized_inputs, image_inputs, image_positions = (
            cls.batch_tokenized_inputs(pb.requests, tokenizer, processor, config)
        )
        batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device)
        batch.image_inputs = image_inputs
        batch.image_positions = image_positions
        batch.encoder_cache = [{} for _ in range(len(pb.requests))]
        if len(image_inputs):
            batch.pixel_values = None
            batch.pixel_attention_mask = None
            batch.image_sizes = None
            batch.image_grid_thw = None
        return batch

    def prepare_for_prefill(
        self, max_padded_input_len, max_padded_bs, max_total_tokens, pad_token_id
    ):
        super().prepare_for_prefill(
            max_padded_input_len, max_padded_bs, max_total_tokens, pad_token_id
        )

        self.has_image_inputs = False
        self.cache_entries_to_free = []

        self.pixel_values = []

        assert (
            len(self.cache_lengths)
            == len(self.input_lengths)
            == len(self.prefilling_mask)
        ), "Mismatch in lengths of cache_lengths, input_lengths, and prefilling_mask"

        for i, (
            cache_length,
            input_length,
            request_prefilling,
        ) in enumerate(
            zip(
                self.cache_lengths,
                self.input_lengths,
                self.prefilling_mask,
            )
        ):
            if not request_prefilling or self.image_positions[i] is None:
                continue

            for image_position in self.image_positions[i]:
                if image_position is None:
                    continue
                start_pos = image_position.offset
                length = image_position.length

                if start_pos >= cache_length + input_length:
                    # No encoder input required at this step
                    break
                if start_pos + length <= cache_length:
                    # The encode input is already processed
                    continue

                self.has_image_inputs = True

                if image_position.id not in self.encoder_cache[i]:
                    image_inputs = self.image_inputs[i][image_position.id]
                    self.pixel_values.append((i, image_position.id, image_inputs))

                    # Remove the image from the image_inputs
                    self.image_inputs[i][image_position.id] = None

        if not self.has_image_inputs:
            self.pixel_values = None
            self.pixel_attention_mask = None
            self.image_sizes = None
            self.image_grid_thw = None
        else:
            image_grid_thw_list = [
                x[2]["image_grid_thw"]
                for x in self.pixel_values
                if "image_grid_thw" in x[2]
            ]
            if image_grid_thw_list:
                self.image_grid_thw = torch.cat(image_grid_thw_list, dim=0)
            else:
                self.image_grid_thw = None

    def update_encoder_cache(self, encoder_outputs, request_id, img_pos):
        self.encoder_cache[request_id][img_pos.id] = scatter_image_embeds(
            encoder_outputs, img_pos.is_embed
        )

    def gather_vision_embeds(self):
        device = self.input_ids.device
        chunks = []
        for (
            i,
            cache_length,
            input_length,
            request_prefilling,
        ) in zip(
            range(len(self.requests)),
            self.cache_lengths,
            self.input_lengths,
            self.prefilling_mask,
        ):
            if not request_prefilling or self.image_positions[i] is None:
                continue

            for image_position in self.image_positions[i]:
                if image_position is None:
                    continue
                start_pos = image_position.offset
                length = image_position.length

                if start_pos >= cache_length + input_length:
                    # No encoder input required at this step
                    break
                if start_pos + length <= cache_length:
                    # The encode input is already processed
                    continue

                start_idx = max(cache_length - start_pos, 0)
                end_idx = min(cache_length - start_pos + input_length, length)

                assert (
                    image_position.id in self.encoder_cache[i]
                ), f"image_id {image_position.id} not in encoder_cache {self.encoder_cache[i]}"
                encoder_output = self.encoder_cache[i][image_position.id]

                is_embed = image_position.is_embed
                if is_embed is not None:
                    is_embed = is_embed[start_idx:end_idx]

                from loguru import logger

                logger.info(
                    f"image_id {image_position.id} start_idx {start_idx} end_idx {end_idx}, length {length}"
                )

                embeds = gather_image_embeds(
                    encoder_output[start_idx:end_idx],
                    is_embed=is_embed,
                )
                if embeds is not None:
                    chunks.append(embeds)

                if end_idx == length:
                    self.cache_entries_to_free.append((i, image_position.id))
                    self.image_positions[i][image_position.id] = None

        if len(chunks) == 0:
            return None
        return torch.cat(chunks, dim=0).to(device)

    def free_encoder_cache(self):
        for i, image_id in self.cache_entries_to_free:
            self.encoder_cache[i].pop(image_id, None)

        self.cache_entries_to_free = []


class FlashVlmCausalLM(FlashCausalLM):
    def __init__(
        self,
        model_id: str,
        *,
        processor_class=AutoProcessor,
        processor_kwargs=None,
        batch_class=FlashVlmCausalLMBatch,
        revision,
        trust_remote_code: bool,
        support_chunking: bool = False,
        **kwargs,
    ):
        if PREFIX_CACHING:
            raise NotImplementedError("Vlm do not work with prefix caching yet")
        if processor_kwargs is None:
            processor_kwargs = {}
        self.processor = processor_class.from_pretrained(
            model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **processor_kwargs,
        )
        self.batch_class = batch_class
        super().__init__(
            model_id=model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
            support_chunking=support_chunking,
            **kwargs,
        )

    @property
    def batch_type(self) -> Type[FlashVlmCausalLMBatch]:
        return self.batch_class

    def max_past(self) -> Optional[int]:
        return getattr(self.model.text_model, "max_past", None)

    def warmup_decode(
        self, batch_size: int, block_num: int, batch: FlashVlmCausalLMBatch
    ):
        input_ids = torch.zeros(batch_size, dtype=batch.input_ids.dtype)
        position_ids = torch.arange(batch_size, dtype=batch.position_ids.dtype)
        if batch.position_ids is not None and batch.position_ids.dim() == 2:
            # qwen2_vl and qwen2_5_vl case
            position_ids = position_ids.unsqueeze(-1).repeat(
                (1, batch.position_ids.shape[-1])
            )
        blocks = [block_num // batch_size for _ in range(batch_size)]
        blocks[0] += block_num % batch_size
        block_tables = []
        slots = []
        start_idx = 0
        slot_indices = []

        # fetch the last blocked to warmup block num

        for i in range(batch_size):
            block_array = list(range(start_idx, start_idx + blocks[i]))
            slots.append(BLOCK_SIZE * block_array[-1] + BLOCK_SIZE - 1)
            block_tables.append(block_array)
            slot_indices.append((start_idx + blocks[i]) * BLOCK_SIZE - 1)
            start_idx += blocks[i]
        input_lengths = torch.ones(batch_size, dtype=torch.int32)

        seqlen = Seqlen(
            input_lengths=_async_h2d_tensor_copy(input_lengths),
        )
        block_list, block_groups, block_usage, _, block_bucket_size = (
            generate_block_metadata(
                self.dtype,
                self.use_contiguous_pa,
                slots,
                block_tables,
                self.bucketing_ctx,
            )
        )
        meta = HPUPagedAttentionMetadata(
            block_list=_async_h2d_tensor_copy(block_list),
            block_groups=_async_h2d_tensor_copy(block_groups),
            block_usage=_async_h2d_tensor_copy(block_usage),
            block_mapping=None,
            attn_bias=None,
        )
        if self.sliding_window is not None:
            block_tables_in_window = []
            for i, bt in enumerate(block_tables):
                block_num_in_window = (
                    self.sliding_window + BLOCK_SIZE - 1
                ) // BLOCK_SIZE
                block_tables_in_window.append(
                    bt[max(0, blocks[i] - block_num_in_window) : blocks[i]]
                )
            slots_in_window = []
            start_idx = 0
            for i, indice in enumerate(slot_indices):
                mask = (
                    indice - torch.arange(start_idx, indice + 1)
                ) < self.sliding_window
                slots_in_window.append(torch.arange(start_idx, indice + 1)[mask])
                start_idx += blocks[i] * BLOCK_SIZE
            slots_in_window = torch.cat(slots_in_window, dim=0)
            (
                block_list_in_window,
                block_groups_in_window,
                block_usage_in_window,
                slots_in_window_mask,
                _,
            ) = generate_block_metadata(
                self.dtype,
                self.use_contiguous_pa,
                slots,
                block_tables_in_window,
                self.bucketing_ctx,
                slots_in_window,
                block_bucket_size,
            )
            meta.block_list_in_window = _async_h2d_tensor_copy(block_list_in_window)
            meta.block_groups_in_window = _async_h2d_tensor_copy(block_groups_in_window)
            meta.block_usage_in_window = _async_h2d_tensor_copy(block_usage_in_window)
            meta.slots_in_window_mask = _async_h2d_tensor_copy(slots_in_window_mask)

        hpu_attention_meta = trim_attn_metadata(meta)
        slots_tensor = torch.tensor(slots, dtype=batch.slots.dtype)
        inputs_embeds = self.get_inputs_embeds(
            input_ids=input_ids.to(self.device),
        )
        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
        self.model.forward(
            inputs_embeds=inputs_embeds,
            position_ids=_async_h2d_tensor_copy(position_ids),
            cu_seqlen_prefill=None,
            kv_cache=self.kv_cache,
            slots=_async_h2d_tensor_copy(slots_tensor),
            seqlen=trim_seqlen_metadata(seqlen),
            hpu_attention_meta=hpu_attention_meta,
            lm_head_indices=None,
            attention_mask=None,
        )

    def warmup_hpu_graph(self, batch: FlashVlmCausalLMBatch):
        free_mem = HabanaMemoryProfiler.current_free_device_memory()
        graph_free_mem = free_mem - self.mem_reserved
        graph_free_mem = self.align_workers(
            graph_free_mem, torch.distributed.ReduceOp.MIN
        )
        decode_available_memory = graph_free_mem
        msg = (
            f"Using {format_bytes(graph_free_mem)}"
            f"/{format_bytes(free_mem)} "
            "of free device memory for HPUGraphs, "
            f"{format_bytes(decode_available_memory)} for decode "
        )
        log_master(logger.info, msg)
        start_time = time.time()
        warmup_shape_count = 0
        warmup_times = 3

        # only warmup decode, for prefill, image pixal size may change, make the warmup useless
        def ordering_function_max_bs(b):
            return (-b[0], b[1])

        self.bucketing_ctx.generate_decode_buckets(self.bucketing_ctx.num_hpu_blocks)
        buckets = list(
            sorted(self.bucketing_ctx.decode_buckets, key=ordering_function_max_bs)
        )
        total_batch_seq = 0.001
        total_mem = 0
        available_mem = decode_available_memory
        log_master(
            logger.info, f"Decode batch size list:{[bsz[0] for bsz in buckets]}\n"
        )
        for i, (batch_size, block_num) in enumerate(buckets):
            if batch_size > block_num:
                continue
            # Graph memory usage is proportional to seq dimension in a batch
            batch_seq = batch_size
            mem_estimate = batch_seq / total_batch_seq * total_mem
            graphed_bucket = (batch_size, block_num, False)
            if not mem_estimate >= available_mem:
                if graphed_bucket not in self.graphed_buckets:
                    self.graphed_buckets.add(graphed_bucket)
            warmup_shape_count += 1
            self.log_warmup(False, i, len(buckets), batch_size, block_num)
            with HabanaMemoryProfiler() as mem_prof:
                for index in range(warmup_times):
                    self.warmup_decode(batch_size, block_num, batch)
                    synchronize(self.device)
            used_mem = self.align_workers(
                mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX
            )
            if graphed_bucket in self.graphed_buckets:

                available_mem -= used_mem
                total_mem += used_mem
                total_batch_seq += batch_seq

        log_master(logger.info, "Decode warmup successful.\n")

        log_master(
            logger.info,
            f"warmup hpu graph time {int(time.time() - start_time)}s warmup shape count {warmup_shape_count}",
        )

    def get_vision_embeds(
        self,
        pixel_values: torch.Tensor,
        pixel_attention_mask: torch.Tensor,
        image_sizes: torch.Tensor,
        image_grid_thw: torch.Tensor,
    ):
        embeds = self.model.get_vision_embeds(
            pixel_values=pixel_values,
            pixel_attention_mask=pixel_attention_mask,
            image_sizes=image_sizes,
            image_grid_thw=image_grid_thw,
        )
        return embeds

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: Optional[torch.Tensor] = None,
    ):
        return self.model.get_inputs_embeds(
            input_ids=input_ids,
            vision_embeds=vision_embeds,
        )

    def encode_images(self, batch):
        if batch.pixel_values is not None:
            device = batch.input_ids.device
            for request_id, image_id, image_input in batch.pixel_values:
                pixel_values = image_input["pixel_values"].to(device)

                if "pixel_attention_mask" in image_input:
                    pixel_attention_mask = image_input["pixel_attention_mask"].to(
                        device
                    )
                else:
                    pixel_attention_mask = None

                if "image_sizes" in image_input:
                    image_sizes = image_input["image_sizes"].to(device)
                else:
                    image_sizes = None

                if "image_grid_thw" in image_input:
                    image_grid_thw = image_input["image_grid_thw"]
                else:
                    image_grid_thw = None

                encoder_outputs = self.get_vision_embeds(
                    pixel_values=pixel_values,
                    pixel_attention_mask=pixel_attention_mask,
                    image_sizes=image_sizes,
                    image_grid_thw=image_grid_thw,
                )
                batch.update_encoder_cache(
                    encoder_outputs,
                    request_id,
                    batch.image_positions[request_id][image_id],
                )

        batch.pixel_values = None
        batch.pixel_attention_mask = None
        batch.image_sizes = None

    def set_inputs_embeds(self, batch):
        if batch.has_image_inputs:
            self.encode_images(batch)
            vision_embeds = batch.gather_vision_embeds()
            batch.has_image_inputs = False
        else:
            vision_embeds = None

        inputs_embeds = self.get_inputs_embeds(
            batch.input_ids, vision_embeds=vision_embeds
        )

        batch.inputs_embeds = inputs_embeds

    def forward(
        self,
        batch: FlashVlmCausalLMBatch,
        adapter_data: Optional[Dict[str, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Model Forward
        if batch.speculative_ids is not None:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

            speculative_ids = batch.speculative_ids

            B, speculative_length = speculative_ids.shape
            new_length = speculative_length + 1
            new_input_ids = torch.cat(
                [input_ids.unsqueeze(-1), speculative_ids], dim=1
            ).reshape(-1)
            arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0)
            arange_int = arange.to(dtype=torch.int32)
            new_position_ids = (
                position_ids.unsqueeze(-1).expand(B, new_length) + arange
            ).view(-1)
            slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1)
            input_lengths = (
                input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
            ).view(-1)

            # Add Copy the block tables for all members
            block_tables = (
                block_tables.unsqueeze(1)
                .expand(B, new_length, -1)
                .reshape(B * new_length, -1)
                .contiguous()
            )
            max_s = max_s + speculative_length

            input_ids = new_input_ids
            position_ids = new_position_ids
        else:
            input_ids = batch.input_ids
            inputs_embeds = batch.inputs_embeds
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

        if self.model.config.model_type in {"qwen2_vl", "qwen2_5_vl"}:
            if position_ids.dim() == 1 and batch.prefilling:
                position_ids = self.model.get_position_ids(
                    input_ids.cpu(), batch.image_grid_thw
                )
                batch.position_ids = position_ids

        attention_mask = None
        attention_mask_forward = None
        if self.model.config.model_type == "llama4":
            attention_mask = (input_ids != self.tokenizer.pad_token_id).long()
            attention_mask_forward = attention_mask.view(input_lengths.shape[0], -1)

        if cu_seqlen_prefill is None and self.max_past() is not None:
            # In decode, not prefill, we're actually overwriting the KV-cache
            # in a circular buffer mode.
            # This makes sure the max_s for the decode pass is correct.
            max_s = min(self.max_past(), max_s)

        if batch.prefill_cache_indices is not None:
            slots_pad = torch.zeros_like(input_ids, device=slots.device)
            slots_pad[batch.prefill_cache_indices] = slots
            slots = slots_pad
        else:
            slots_pad = torch.zeros_like(input_ids, device=slots.device)
            slots_pad[: slots.shape[0]] = slots
            slots = slots_pad

        seqlen = Seqlen(
            input_lengths=_async_h2d_tensor_copy(input_lengths),
        )
        kwargs = {}
        batch_size = input_lengths.shape[0]
        prompt_len = (
            input_ids.shape[0] // batch_size
            if batch.prefilling
            else batch.hpu_attn_meta.block_list.shape[0]
        )
        if htorch.utils.internal.is_lazy():
            kwargs["bypass_hpu_graphs"] = not self.use_graphs(
                batch.prefilling, prompt_len, batch_size
            )
        if self.sliding_window is not None:
            attn_mask = seqlen.make_sliding_window_bias(
                input_lengths.tolist(),
                self.sliding_window,
                self.dtype,
                prompt_len,
                batch_size,
            )
            seqlen.attn_mask = _async_h2d_tensor_copy(attn_mask)
        logits, speculative_logits = self.model.forward(
            inputs_embeds=inputs_embeds,
            position_ids=_async_h2d_tensor_copy(position_ids),
            cu_seqlen_prefill=_async_h2d_tensor_copy(cu_seqlen_prefill),
            kv_cache=kv_cache,
            slots=_async_h2d_tensor_copy(slots),
            seqlen=trim_seqlen_metadata(seqlen),
            hpu_attention_meta=batch.hpu_attn_meta,
            lm_head_indices=_async_h2d_tensor_copy(lm_head_indices),
            attention_mask=attention_mask_forward,
            **kwargs,
        )
        batch.image_grid_thw = None
        batch.free_encoder_cache()
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/globals.py
================================================
import os
from typing import Dict, Optional
from loguru import logger
from text_generation_server.utils.log import log_master

REQUEST_LOGPROBS = os.getenv("REQUEST_LOGPROBS", "0").lower() in {"1", "true"}
ATTENTION = os.getenv("ATTENTION", "paged")
# default_prefix_caching = "1" if ATTENTION in {"flashinfer", "flashdecoding"} else "0"
PREFIX_CACHING = os.getenv("PREFIX_CACHING", "0").lower() in {
    "1",
    "true",
}
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
_expected = {"paged"}
assert (
    ATTENTION in _expected
), f"Attention is not valid {ATTENTION}, expected {_expected}"
log_master(logger.info, f"Using Attention = {ATTENTION}")

TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
assert TGI_WIGGLE_ROOM > 0
assert TGI_WIGGLE_ROOM < 1

# This is overridden by the cli
BLOCK_SIZE: int

BLOCK_SIZE = 128


# This is overridden at model loading.
global MODEL_ID
MODEL_ID = None


def set_model_id(model_id: str):
    global MODEL_ID
    MODEL_ID = model_id


# NOTE: eventually we should move this into the router and pass back the
# index in all cases.
ADAPTER_TO_INDEX: Optional[Dict[str, int]] = None


def set_adapter_to_index(adapter_to_index: Dict[str, int]):
    global ADAPTER_TO_INDEX
    ADAPTER_TO_INDEX = adapter_to_index


def get_adapter_to_index():
    global ADAPTER_TO_INDEX
    return ADAPTER_TO_INDEX


================================================
FILE: backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py
================================================
import torch

import numpy as np

from typing import Iterable, Optional, Tuple, List, Dict
from text_generation_server.pb.generate_pb2 import Request
from io import BytesIO
from PIL import Image
from dataclasses import dataclass
from opentelemetry import trace
from transformers import (
    PreTrainedTokenizerBase,
)
from text_generation_server.models.flash_causal_lm import (
    generate_block_metadata,
)
from text_generation_server.models.flash_vlm_causal_lm import (
    FlashVlmCausalLMBatch,
    FlashVlmCausalLM,
)
from text_generation_server.pb import generate_pb2
from text_generation_server.layers.attention import (
    Seqlen,
    trim_seqlen_metadata,
    _async_h2d_tensor_copy,
    HPUPagedAttentionMetadata,
    trim_attn_metadata,
)
import habana_frameworks.torch as htorch
from loguru import logger
from text_generation_server.models.globals import BLOCK_SIZE
from text_generation_server.utils.import_utils import (
    synchronize,
)
import torch.nn.functional as F
from text_generation_server.utils.log import log_master
import time
import os
from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes

tracer = trace.get_tracer(__name__)


@dataclass
class FlashMllamaCausalLMBatch(FlashVlmCausalLMBatch):
    image_indices: List[int] = 42
    aspect_ratio_ids: Optional[torch.Tensor] = None
    aspect_ratio_mask: Optional[torch.Tensor] = None
    cross_attention_states: Optional[torch.Tensor] = None

    def prepare_for_prefill(
        self, max_padded_input_len, max_padded_bs, max_total_tokens, pad_token_id
    ):
        super(FlashVlmCausalLMBatch, self).prepare_for_prefill(
            max_padded_input_len, max_padded_bs, max_total_tokens, pad_token_id
        )

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(cls, batches, padded_total_bs: int = 0):
        batch = super(FlashVlmCausalLMBatch, cls).concatenate(batches, padded_total_bs)
        batch.pixel_values = None
        batch.pixel_attention_mask = None

        offset = 0
        image_indices = []
        attention_states = []
        for b in batches:
            if b.cross_attention_states is not None:
                attention_states.append(b.cross_attention_states)
            image_indices.extend([i + offset for i in b.image_indices])
            offset += len(b.image_indices)
        if len(attention_states) > 0:
            assert len(image_indices) > 0
            batch.cross_attention_states = torch.cat(attention_states, dim=0)
            batch.image_indices = image_indices
        else:
            batch.cross_attention_states = None
            batch.image_indices = []
        return batch

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]):
        assert self.image_indices is not None
        batch = super(FlashVlmCausalLMBatch, self).filter(request_ids)
        assert self.image_indices is not None
        indices = []
        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            indices.append(idx)

        offset = 0
        new_image_indices = []
        prev_i = None
        for i in self.image_indices:
            if i in indices:
                new_image_indices.append(offset)
                if i != prev_i:
                    offset += 1
                prev_i = i

        batch.image_indices = new_image_indices
        if len(new_image_indices) > 0:
            assert max(new_image_indices) < self.cross_attention_states.shape[0]
            assert offset <= self.cross_attention_states.shape[0]
            batch.cross_attention_states = self.cross_attention_states[
                new_image_indices
            ]
        else:
            batch.cross_attention_states = None
        batch.pixel_values = None
        return batch

    @classmethod
    def batch_tokenized_inputs(
        cls, requests: Iterable[Request], tokenizer, processor, config
    ):
        image_inputs = []
        texts = []
        image_indices = []
        batch_tokenized_inputs = []

        for i, r in enumerate(requests):
            # Each input is encoded into a list, where each element of this input list is either a string or a URL
            curr_text = ""
            curr_image = None
            curr_i = None
            for chunk in r.input_chunks.chunks:
                chunk_type = chunk.WhichOneof("chunk")
                if chunk_type == "text":
                    curr_text += chunk.text
                elif chunk_type == "image":
                    image = Image.open(BytesIO(chunk.image.data))
                    # TODO unsure about BOS
                    curr_text += "<|image|>"
                    image_input = processor.image_processor(image, return_tensors="pt")
                    curr_image = image_input
                    curr_i = i
                    # image_inputs.append(image_input)
                    # image_indices.append(i)
                else:
                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
            texts.append(curr_text)
            if curr_image is not None:
                image_inputs.append(curr_image)
                image_indices.append(curr_i)

            input_ids = tokenizer(
                curr_text,
                truncation=True,
                max_length=r.truncate,
                add_special_tokens=r.add_special_tokens,
            )["input_ids"]
            batch_tokenized_inputs.append(input_ids)
        if image_inputs:
            image_input = image_inputs[0]
            new_image_inputs = {
                "pixel_values": torch.cat(
                    [img["pixel_values"] for img in image_inputs], dim=0
                ),
            }
            if "aspect_ratio_ids" in image_input:
                new_image_inputs["aspect_ratio_ids"] = torch.cat(
                    [img["aspect_ratio_ids"] for img in image_inputs], dim=0
                )
            if "aspect_ratio_mask" in image_input:
                new_image_inputs["aspect_ratio_mask"] = torch.cat(
                    [img["aspect_ratio_mask"] for img in image_inputs], dim=0
                )
            image_inputs = new_image_inputs
            image_inputs["image_indices"] = image_indices
        else:
            image_inputs = None

        if image_inputs is not None:
            assert len(image_indices) == image_inputs["pixel_values"].shape[0]

        return batch_tokenized_inputs, image_inputs

    @classmethod
    def from_pb_processor(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        processor,
        config,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "FlashVlmCausalLMBatch":
        batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs(
            pb.requests, tokenizer, processor, config
        )
        batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device)
        # XXX: <|image|> token is actually out of bounds and bugs out the logit processors.
        batch.all_input_ids_tensor = batch.all_input_ids_tensor.clamp(
            max=config.text_config.vocab_size - 1
        )
        if isinstance(batch.input_ids, list):
            if len(batch) > 1:
                input_ids = np.concatenate(batch.input_ids, dtype=np.int64)
            else:
                input_ids = batch.input_ids[0]
            batch.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)

        batch.input_ids = batch.input_ids.clamp(max=config.text_config.vocab_size - 1)

        if image_inputs is not None:
            batch.pixel_values = image_inputs["pixel_values"].to(
                device=device, dtype=dtype
            )
            batch.aspect_ratio_ids = image_inputs["aspect_ratio_ids"].to(device=device)
            batch.aspect_ratio_mask = image_inputs["aspect_ratio_mask"].to(
                device=device
            )
            batch.image_indices = image_inputs["image_indices"]
        else:
            batch.pixel_values = None
            batch.aspect_ratio_ids = None
            batch.aspect_ratio_mask = None
            batch.image_indices = []
        assert batch.image_indices is not None
        return batch


def generate_cross_attention_states(
    cross_attention_states, image_indices, input_lengths, pad_seq_len, prefilling
):
    if cross_attention_states is None:
        return None, None
    indices_list = []
    if prefilling:
        for i in image_indices:
            indices_list.append(torch.arange(pad_seq_len * i, pad_seq_len * (i + 1)))
        indices = torch.cat(indices_list, dim=0)
    else:
        indices = image_indices[:]
    return indices, input_lengths.index_select(0, image_indices)


class FlashMllamaCausalLM(FlashVlmCausalLM):
    def set_inputs_embeds(self, batch):
        # Set the input embeddings to None, as we are using the input_ids for the model
        batch.inputs_embeds = None

    def warmup_decode(
        self, batch_size: int, block_num: int, batch: FlashMllamaCausalLMBatch
    ):
        input_ids = torch.zeros(batch_size, dtype=batch.input_ids.dtype)
        position_ids = torch.arange(batch_size, dtype=batch.position_ids.dtype)
        blocks = [block_num // batch_size for _ in range(batch_size)]
        blocks[0] += block_num % batch_size
        block_tables = []
        slots = []
        start_idx = 0
        slot_indices = []

        # fetch the last blocked to warmup block num
        for i in range(batch_size):
            block_array = list(range(start_idx, start_idx + blocks[i]))
            slots.append(BLOCK_SIZE * block_array[-1] + BLOCK_SIZE - 1)
            block_tables.append(block_array)
            slot_indices.append((start_idx + blocks[i]) * BLOCK_SIZE - 1)
            start_idx += blocks[i]
        input_lengths = torch.ones(batch_size, dtype=torch.int32)

        seqlen = Seqlen(
            input_lengths=_async_h2d_tensor_copy(input_lengths),
        )
        block_list, block_groups, block_usage, _, block_bucket_size = (
            generate_block_metadata(
                self.dtype,
                self.use_contiguous_pa,
                slots,
                block_tables,
                self.bucketing_ctx,
            )
        )
        meta = HPUPagedAttentionMetadata(
            block_list=_async_h2d_tensor_copy(block_list),
            block_groups=_async_h2d_tensor_copy(block_groups),
            block_usage=_async_h2d_tensor_copy(block_usage),
            block_mapping=None,
            attn_bias=None,
        )

        hpu_attention_meta = trim_attn_metadata(meta)
        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
        image_indices = torch.tensor(batch.image_indices)
        image_indices = image_indices.repeat(batch_size)
        cross_attention_states = batch.cross_attention_states.repeat(batch_size, 1, 1)
        indices, cross_attention_len = generate_cross_attention_states(
            cross_attention_states, image_indices, input_lengths, 1, False
        )
        slots_tensor = torch.tensor(slots, dtype=batch.slots.dtype)
        kwargs = {}
        if htorch.utils.internal.is_lazy():
            kwargs["bypass_hpu_graphs"] = not self.use_graphs(
                False, hpu_attention_meta.block_list.shape[0], batch_size
            )
        self.model.forward(
            input_ids=_async_h2d_tensor_copy(input_ids),
            position_ids=_async_h2d_tensor_copy(position_ids),
            cu_seqlen_prefill=None,
            kv_cache=self.kv_cache,
            slots=_async_h2d_tensor_copy(slots_tensor),
            seqlen=trim_seqlen_metadata(seqlen),
            hpu_attention_meta=hpu_attention_meta,
            lm_head_indices=None,
            adapter_data=None,
            cross_attention_states=cross_attention_states,
            indices=_async_h2d_tensor_copy(indices),
            cross_attention_len=_async_h2d_tensor_copy(cross_attention_len),
            **kwargs,
        )

    def warmup_prefill(
        self, prompt_len: int, batch_size: int, batch: FlashMllamaCausalLMBatch
    ):
        input_ids = torch.zeros(prompt_len, dtype=batch.input_ids.dtype).repeat(
            batch_size
        )
        position_ids = torch.arange(prompt_len, dtype=batch.position_ids.dtype).repeat(
            batch_size
        )
        max_bt = (prompt_len // BLOCK_SIZE + 1) * batch_size
        block_tables = torch.arange(max_bt, dtype=torch.int32).reshape(batch_size, -1)
        slot_acc = []
        for i in range(batch_size):
            slots = []
            for b in block_tables[i]:
                slots.extend(range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE))
            slot_acc.extend(slots[:prompt_len])
        slots = torch.tensor(slot_acc, dtype=batch.slots.dtype)

        input_lengths = (
            torch.ones(
                batch_size,
                dtype=torch.int32,
            )
            * prompt_len
        )
        cu_seqlen_prefill = torch.zeros(batch_size + 1, dtype=torch.int32)
        torch.cumsum(input_lengths, -1, out=cu_seqlen_prefill[1:])

        lm_head_indices = input_lengths - 1

        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
        image_indices = torch.tensor(batch.image_indices)
        image_indices = image_indices.repeat(batch_size)
        cross_attention_states = batch.cross_attention_states.repeat(batch_size, 1, 1)
        indices, cross_attention_len = generate_cross_attention_states(
            cross_attention_states, image_indices, input_lengths, prompt_len, True
        )
        seqlen = Seqlen(
            input_lengths=_async_h2d_tensor_copy(input_lengths),
        )
        kwargs = {}
        if htorch.utils.internal.is_lazy():
            kwargs["bypass_hpu_graphs"] = not self.use_graphs(
                True, prompt_len, batch_size
            )
        self.model.forward(
            input_ids=_async_h2d_tensor_copy(input_ids),
            position_ids=_async_h2d_tensor_copy(position_ids),
            cu_seqlen_prefill=_async_h2d_tensor_copy(cu_seqlen_prefill),
            kv_cache=self.kv_cache,
            slots=_async_h2d_tensor_copy(slots),
            seqlen=trim_seqlen_metadata(seqlen),
            hpu_attention_meta=None,
            lm_head_indices=_async_h2d_tensor_copy(lm_head_indices),
            adapter_data=None,
            cross_attention_states=cross_attention_states,
            indices=_async_h2d_tensor_copy(indices),
            cross_attention_len=_async_h2d_tensor_copy(cross_attention_len),
            **kwargs,
        )

    def warmup_hpu_graph(self, batch: FlashMllamaCausalLMBatch):
        prompt_graph_mem_ratio = float(os.environ.get("VLLM_GRAPH_PROMPT_RATIO", "0.3"))
        free_mem = HabanaMemoryProfiler.current_free_device_memory()
        graph_free_mem = free_mem - self.mem_reserved
        graph_free_mem = self.align_workers(
            graph_free_mem, torch.distributed.ReduceOp.MIN
        )
        prompt_available_memory = prompt_graph_mem_ratio * graph_free_mem
        decode_available_memory = graph_free_mem - prompt_available_memory
        msg = (
            f"Using {format_bytes(graph_free_mem)}"
            f"/{format_bytes(free_mem)} "
            "of free device memory for HPUGraphs, "
            f"{format_bytes(prompt_available_memory)} for prompt and "
            f"{format_bytes(decode_available_memory)} for decode "
            f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})"
        )
        log_master(logger.info, msg)
        start_time = time.time()
        warmup_shape_count = 0
        warmup_times = 3
        self.bucketing_ctx.generate_prompt_buckets()

        def ordering_function_min_tokens(b):
            return (b[0] * b[1], b[1], b[0])

        buckets = list(
            sorted(self.bucketing_ctx.prompt_buckets, key=ordering_function_min_tokens)
        )
        graph_free_mem
        total_batch_seq = 0.001
        total_mem = 0
        available_mem = prompt_available_memory
        msg = (
            f"Prefill batch size list:{[bsz[0] for bsz in buckets]}\n"
            f"Prefill sequence length list:{[seq[1] for seq in buckets]}\n"
        )
        log_master(logger.info, msg)
        for i, (batch_size, seq_len) in enumerate(buckets):
            if batch_size * seq_len > self.max_batch_prefill_tokens:
                continue
            # Graph memory usage is proportional to seq dimension in a batch
            batch_seq = batch_size * seq_len
            mem_estimate = batch_seq / total_batch_seq * total_mem
            graphed_bucket = (batch_size, seq_len, True)
            if not (
                mem_estimate >= available_mem or batch_seq > self.max_seq_len_to_capture
            ):
                if graphed_bucket not in self.graphed_buckets:
                    self.graphed_buckets.add(graphed_bucket)
            warmup_shape_count += 1
            self.log_warmup(True, i, len(buckets), batch_size, seq_len)
            with HabanaMemoryProfiler() as mem_prof:
                for index in range(warmup_times):
                    self.warmup_prefill(seq_len, batch_size, batch)
                    synchronize(self.device)
            used_mem = self.align_workers(
                mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX
            )
            if graphed_bucket in self.graphed_buckets:
                available_mem -= used_mem
                total_mem += used_mem
                total_batch_seq += batch_seq

        log_master(logger.info, "Prefill warmup successful.\n")

        def ordering_function_max_bs(b):
            return (-b[0], b[1])

        self.bucketing_ctx.generate_decode_buckets(self.bucketing_ctx.num_hpu_blocks)
        buckets = list(
            sorted(self.bucketing_ctx.decode_buckets, key=ordering_function_max_bs)
        )
        free_mem = HabanaMemoryProfiler.current_free_device_memory()
        total_batch_seq = 0.001
        total_mem = 0
        available_mem = free_mem - self.mem_reserved
        log_master(
            logger.info, f"Decode batch size list:{[bsz[0] for bsz in buckets]}\n"
        )
        for i, (batch_size, block_num) in enumerate(buckets):
            if batch_size > block_num:
                continue
            # Graph memory usage is proportional to seq dimension in a batch
            batch_seq = batch_size
            mem_estimate = batch_seq / total_batch_seq * total_mem
            graphed_bucket = (batch_size, block_num, False)
            if not mem_estimate >= available_mem:
                if graphed_bucket not in self.graphed_buckets:
                    self.graphed_buckets.add(graphed_bucket)
            warmup_shape_count += 1
            self.log_warmup(False, i, len(buckets), batch_size, block_num)
            with HabanaMemoryProfiler() as mem_prof:
                for index in range(warmup_times):
                    self.warmup_decode(batch_size, block_num, batch)
                    synchronize(self.device)
            used_mem = self.align_workers(
                mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX
            )
            if graphed_bucket in self.graphed_buckets:
                available_mem -= used_mem
                total_mem += used_mem
                total_batch_seq += batch_seq

        log_master(logger.info, "Decode warmup successful.\n")

        log_master(
            logger.info,
            f"warmup hpu graph time {int(time.time() - start_time)}s warmup shape count {warmup_shape_count}",
        )

    def forward(
        self,
        batch: FlashMllamaCausalLMBatch,
        adapter_data: Optional[Dict[str, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Model Forward
        if batch.speculative_ids is not None:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

            speculative_ids = batch.speculative_ids

            B, speculative_length = speculative_ids.shape
            new_length = speculative_length + 1
            new_input_ids = torch.cat(
                [input_ids.unsqueeze(-1), speculative_ids], dim=1
            ).reshape(-1)
            arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0)
            arange_int = arange.to(dtype=torch.int32)
            new_position_ids = (
                position_ids.unsqueeze(-1).expand(B, new_length) + arange
            ).view(-1)
            slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1)
            input_lengths = (
                input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
            ).view(-1)

            # Add Copy the block tables for all members
            block_tables = (
                block_tables.unsqueeze(1)
                .expand(B, new_length, -1)
                .reshape(B * new_length, -1)
                .contiguous()
            )
            max_s = max_s + speculative_length

            input_ids = new_input_ids
            position_ids = new_position_ids
        else:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

        if cu_seqlen_prefill is None and self.max_past() is not None:
            # In decode, not prefill, we're actually overwriting the KV-cache
            # in a circular buffer mode.
            # This makes sure the max_s for the decode pass is correct.
            max_s = min(self.max_past(), max_s)

        if batch.pixel_values is not None:
            cross_attention_states = self.model.vision_forward(
                pixel_values=batch.pixel_values,
                aspect_ratio_ids=batch.aspect_ratio_ids,
                aspect_ratio_mask=batch.aspect_ratio_mask,
            )
            batch.cross_attention_states = cross_attention_states

        cross_attention_states = batch.cross_attention_states

        kwargs = {}
        if htorch.utils.internal.is_lazy():
            batch_size = input_lengths.shape[0]
            seqlen = (
                input_ids.shape[0] // batch_size
                if batch.prefilling
                else batch.hpu_attn_meta.block_list.shape[0]
            )
            kwargs["bypass_hpu_graphs"] = not self.use_graphs(
                batch.prefilling, seqlen, batch_size
            )

        if batch.prefill_cache_indices is not None:
            slots_pad = torch.zeros_like(input_ids, device=slots.device)
            slots_pad[batch.prefill_cache_indices] = slots
            slots = slots_pad
        else:
            slots_pad = torch.zeros_like(input_ids, device=slots.device)
            slots_pad[: slots.shape[0]] = slots
            slots = slots_pad
        orig_bs = len(batch)
        padded_bs = batch.input_lengths_tensor.shape[0]
        padded_input_len = input_ids.view(padded_bs, -1).shape[-1]
        image_indices = torch.tensor(batch.image_indices)

        if cross_attention_states is not None:
            cross_attention_states = F.pad(
                cross_attention_states,
                (0, 0, 0, 0, 0, (padded_bs - orig_bs)),
                value=0,
            )
        if len(image_indices) != 0:
            pad_indices = torch.arange(orig_bs, padded_bs)
            image_indices = torch.cat((image_indices, pad_indices), dim=0)

        indices, cross_attention_len = generate_cross_attention_states(
            cross_attention_states,
            image_indices,
            input_lengths,
            padded_input_len,
            batch.prefilling,
        )
        seqlen = Seqlen(
            input_lengths=_async_h2d_tensor_copy(input_lengths),
        )
        logits, speculative_logits = self.model.forward(
            input_ids=input_ids,
            position_ids=_async_h2d_tensor_copy(position_ids),
            cu_seqlen_prefill=_async_h2d_tensor_copy(cu_seqlen_prefill),
            kv_cache=kv_cache,
            slots=_async_h2d_tensor_copy(slots),
            seqlen=trim_seqlen_metadata(seqlen),
            hpu_attention_meta=batch.hpu_attn_meta,
            lm_head_indices=_async_h2d_tensor_copy(lm_head_indices),
            # TODO list
            adapter_data=None,
            cross_attention_states=cross_attention_states,
            indices=_async_h2d_tensor_copy(indices),
            cross_attention_len=_async_h2d_tensor_copy(cross_attention_len),
            **kwargs,
        )
        if batch.pixel_values is not None:
            batch.pixel_values = None
        return logits, speculative_logits


================================================
FILE: backends/gaudi/server/text_generation_server/models/model.py
================================================
import inspect
import torch

from abc import ABC, abstractmethod
from typing import List, Tuple, Optional, TypeVar, Type, Dict
from collections import defaultdict
from transformers import PreTrainedTokenizerBase

from text_generation_server.models.types import Batch, Generation
from text_generation_server.models.globals import BLOCK_SIZE
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.pb.generate_pb2 import InfoResponse
from text_generation_server.adapters.weights import LayerAdapterWeights
from text_generation_server.pb import generate_pb2

BASE_MODEL_ADAPTER_ID = "__base_model__"


B = TypeVar("B", bound=Batch)


class Model(ABC):
    def __init__(
        self,
        model_id: str,
        model: torch.nn.Module,
        tokenizer: PreTrainedTokenizerBase,
        requires_padding: bool,
        dtype: torch.dtype,
        device: torch.device,
        rank: int = 0,
        world_size: int = 1,
        sliding_window: Optional[int] = None,
        speculate: Optional[int] = None,
        adapter_id: str = BASE_MODEL_ADAPTER_ID,
        support_chunking: bool = False,
    ):
        self.model_id = model_id
        self.model = model.eval()
        self.tokenizer = tokenizer

        # all_special_ids is not set correctly if the rust tokenizer is unpacked
        # TODO report this to transformers.
        other_special_ids = {
            id for id, token in tokenizer.added_tokens_decoder.items() if token.special
        }
        self.all_special_ids = set(tokenizer.all_special_ids)
        self.all_special_ids.update(other_special_ids)
        self.requires_padding = requires_padding
        self.dtype = dtype
        self.device = device
        self.rank = rank
        self.world_size = world_size
        self.sliding_window = sliding_window if sliding_window != -1 else None

        self.layer_to_adapter_weights: Dict[str, LayerAdapterWeights] = defaultdict(
            LayerAdapterWeights
        )
        self.loaded_adapters = set()
        self.static_adapter_id = adapter_id

        if speculate is None:
            speculate = get_speculate()
        self.speculate = speculate

        self.has_position_ids = (
            inspect.signature(model.forward).parameters.get("position_ids", None)
            is not None
        )

        self.check_initialized()

    @property
    def info(self) -> InfoResponse:
        if self.requires_padding and self.sliding_window is not None:
            raise NotImplementedError("sliding_window is not implemented with padding")

        return InfoResponse(
            requires_padding=self.requires_padding,
            dtype=str(self.dtype),
            device_type=self.device.type,
            window_size=None,
            speculate=self.speculate,
            block_size=BLOCK_SIZE,
        )

    @property
    @abstractmethod
    def batch_type(self) -> Type[B]:
        raise NotImplementedError

    @abstractmethod
    def generate_token(
        self, batch: B
    ) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]:
        raise NotImplementedError

    def warmup(
        self, batch: generate_pb2.WarmupRequest
    ) -> Tuple[Optional[int], Optional[int], Optional[int]]:
        self.generate_token(batch)
        return None, None, None

    def decode_token(
        self,
        all_input_ids: List[int],
        prefix_offset: int = 0,
        read_offset: int = 0,
        skip_special_tokens: bool = False,
    ) -> Tuple[str, int, int]:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""

        # The prefix text is necessary only to defeat cleanup algorithms in the decode
        # which decide to add a space or not depending on the surrounding ids.
        prefix_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
        )

        new_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
        )

        if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
            # utf-8 char at the end means it's a potential unfinished byte sequence
            # from byte fallback tokenization.
            # If it's in the middle, it's probably a real invalid id generated
            # by the model
            new_text = new_text[len(prefix_text) :]
            return new_text, read_offset, len(all_input_ids)
        else:
            return "", prefix_offset, read_offset

    def check_initialized(self):
        uninitialized_parameters = []
        for n, p in self.model.named_parameters():
            if p.data.device == torch.device("meta"):
                uninitialized_parameters.append(n)
        if uninitialized_parameters:
            raise RuntimeError(
                f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
            )


================================================
FILE: backends/gaudi/server/text_generation_server/models/seq2seq_lm.py
================================================
import torch
import torch.distributed
import time
from dataclasses import dataclass
from opentelemetry import trace
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    PreTrainedTokenizerBase,
    AutoConfig,
)
from typing import Optional, Tuple, List, Type, Dict
from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
)
from text_generation_server.utils.chunks import concat_text_chunks
from text_generation_server.utils.quantization import get_loader
from text_generation_server.utils.tokens import batch_top_tokens
from text_generation_server.models import Model
from text_generation_server.models.types import (
    GeneratedText,
    Batch,
    Generation,
    Tokens,
)
from text_generation_server.pb import generate_pb2
from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling

tracer = trace.get_tracer(__name__)


@dataclass
class Seq2SeqLMBatch(Batch):
    batch_id: int
    requests: List[generate_pb2.Request]
    requests_idx_mapping: Dict[int, int]

    # Encoder values
    input_ids: Optional[torch.Tensor]
    attention_mask: torch.Tensor

    # Decoder values
    decoder_input_ids: torch.Tensor
    decoder_attention_mask: Optional[torch.Tensor]
    encoder_last_hidden_state: Optional[torch.Tensor]

    # All tokens
    all_decoder_input_ids: List[torch.Tensor]

    # Seq2SeqLM keeps track of both encoder and decoder attention keys and values
    past_key_values: Optional[List[Tuple]]

    # Lengths of all generations present in the batch
    input_lengths: List[int]
    decoder_input_lengths: List[int]
    prefix_offsets: List[int]
    read_offsets: List[int]

    # Generation helpers
    next_token_choosers: List[NextTokenChooser]
    stopping_criterias: List[StoppingCriteria]
    top_n_tokens: List[int]
    top_n_tokens_tensor: torch.Tensor

    # Metadata used for padding
    max_input_length: int
    max_decoder_input_length: int
    padding_right_offset: int

    # Maximum number of tokens this batch will grow to
    max_tokens: int

    def to_pb(self) -> generate_pb2.CachedBatch:
        """Convert a Seq2SeqLMBatch to a text_generation_server.v1.CachedBatch protobuf"""
        return generate_pb2.CachedBatch(
            id=self.batch_id,
            request_ids=[r.id for r in self.requests],
            size=len(self),
            max_tokens=self.max_tokens,
        )

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "Seq2SeqLMBatch":
        """Convert a text_generation_server.v1.Batch protobuf to a Seq2SeqLMBatch"""
        inputs = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        decoder_input_lengths = []
        prefix_offsets = []
        read_offsets = []
        requests_idx_mapping = {}

        # Parse batch
        max_truncation = 0
        padding_right_offset = 0
        max_decode_tokens = 0
        for i, r in enumerate(pb.requests):
            inputs.append(concat_text_chunks(r.input_chunks.chunks))
            requests_idx_mapping[r.id] = i
            decoder_input_lengths.append(1)
            next_token_choosers.append(
                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
            )
            stopping_criteria = StoppingCriteria.from_pb(
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(r.top_n_tokens)
            max_truncation = max(max_truncation, r.truncate)
            max_decode_tokens += stopping_criteria.max_new_tokens
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
            )

        # Tokenize batch
        tokenized_inputs = tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=max_truncation,
        ).to(device)

        input_lengths = tokenized_inputs["attention_mask"].sum(1)
        max_input_length = input_lengths.max()

        # Decoder sequence only contains the bos_token
        decoder_input_ids = (
            torch.tensor(tokenizer.bos_token_id, device=device)
            .repeat(len(pb.requests))
            .view(-1, 1)
        )
        for _ in pb.requests:
            prefix_offsets.append(0)
            read_offsets.append(1)
        all_decoder_input_ids = decoder_input_ids.view(-1).split(1)
        top_n_tokens_tensor = torch.tensor(
            top_n_tokens, device=device, dtype=torch.int64
        )

        max_tokens = len(inputs) * (max_input_length + max_decode_tokens)

        return cls(
            batch_id=pb.id,
            requests=pb.requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=tokenized_inputs["input_ids"],
            attention_mask=tokenized_inputs["attention_mask"],
            decoder_input_ids=decoder_input_ids,
            all_decoder_input_ids=list(all_decoder_input_ids),
            decoder_attention_mask=None,
            encoder_last_hidden_state=None,
            past_key_values=None,
            input_lengths=input_lengths.tolist(),
            decoder_input_lengths=decoder_input_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length.item(),
            max_decoder_input_length=1,
            padding_right_offset=padding_right_offset,
            max_tokens=max_tokens,
        )

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]) -> Optional["Seq2SeqLMBatch"]:
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")
        if len(request_ids) == len(self):
            return self

        keep_indices = []

        # New values after filtering
        requests_idx_mapping = {}
        requests = []
        input_lengths = []
        decoder_input_lengths = []
        prefix_offsets = []
        read_offsets = []

        all_decoder_input_ids = []

        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []

        max_input_length = 0
        max_decoder_input_length = 0
        padding_right_offset = 0

        total_remaining_decode_tokens = 0

        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            requests_idx_mapping[request_id] = i
            keep_indices.append(idx)

            requests.append(self.requests[idx])
            prefix_offsets.append(self.prefix_offsets[idx])
            read_offsets.append(self.read_offsets[idx])

            all_decoder_input_ids.append(self.all_decoder_input_ids[idx])

            request_input_length = self.input_lengths[idx]
            input_lengths.append(request_input_length)
            max_input_length = max(max_input_length, request_input_length)

            request_decoder_input_length = self.decoder_input_lengths[idx]
            decoder_input_lengths.append(request_decoder_input_length)
            max_decoder_input_length = max(
                max_decoder_input_length, request_decoder_input_length
            )

            next_token_choosers.append(self.next_token_choosers[idx])
            stopping_criteria = self.stopping_criterias[idx]
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(self.top_n_tokens[idx])
            remaining_decode_tokens = (
                stopping_criteria.max_new_tokens - stopping_criteria.current_tokens
            )
            total_remaining_decode_tokens += remaining_decode_tokens
            padding_right_offset = max(padding_right_offset, remaining_decode_tokens)

        # Apply indices to input_ids, attention mask, past key values and other items that need to be cached
        self.decoder_input_ids = self.decoder_input_ids[keep_indices]
        self.attention_mask = self.attention_mask[keep_indices, -max_input_length:]
        if self.decoder_attention_mask is not None:
            self.decoder_attention_mask = self.decoder_attention_mask[
                keep_indices,
                -(self.padding_right_offset + max_decoder_input_length) : (
                    self.decoder_attention_mask.shape[1] - self.padding_right_offset
                )
                + padding_right_offset,
            ]

        self.encoder_last_hidden_state = self.encoder_last_hidden_state[
            keep_indices, -max_input_length:
        ]

        # Ensure that past_key_values tensors can be updated in-place
        if type(self.past_key_values[0]) is tuple:
            self.past_key_values = [
                [t for t in layer] for layer in self.past_key_values
            ]

        decoder_past_seq_len = max_decoder_input_length - 1
        for layer in self.past_key_values:
            layer[0] = layer[0][keep_indices, :, -decoder_past_seq_len:]
            layer[1] = layer[1][keep_indices, :, -decoder_past_seq_len:]
            layer[2] = layer[2][keep_indices, :, -max_input_length:]
            layer[3] = layer[3][keep_indices, :, -max_input_length:]

        top_n_tokens_tensor = self.top_n_tokens_tensor[keep_indices]
        max_tokens = (
            len(request_ids) * (max_input_length + max_decoder_input_length)
            + remaining_decode_tokens
        )

        self.requests = requests
        self.requests_idx_mapping = requests_idx_mapping
        self.input_ids = None
        self.all_decoder_input_ids = all_decoder_input_ids
        self.input_lengths = input_lengths
        self.decoder_input_lengths = decoder_input_lengths
        self.prefix_offsets = prefix_offsets
        self.read_offsets = read_offsets
        self.next_token_choosers = next_token_choosers
        self.stopping_criterias = stopping_criterias
        self.top_n_tokens = top_n_tokens
        self.top_n_tokens_tensor = top_n_tokens_tensor
        self.max_input_length = max_input_length
        self.max_decoder_input_length = max_decoder_input_length
        self.padding_right_offset = padding_right_offset
        self.max_tokens = max_tokens

        return self

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(cls, batches: List["Seq2SeqLMBatch"]) -> "Seq2SeqLMBatch":
        """Concatenate multiple batches together by padding internal torch tensors"""

        # Used for padding
        total_batch_size = 0
        max_input_length = 0
        max_decoder_input_length = 0
        padding_right_offset = 0
        for batch in batches:
            total_batch_size += len(batch)
            max_input_length = max(max_input_length, batch.max_input_length)
            max_decoder_input_length = max(
                max_decoder_input_length, batch.max_decoder_input_length
            )
            padding_right_offset = max(padding_right_offset, batch.padding_right_offset)

        # Batch attributes
        requests = []
        requests_idx_mapping = {}
        all_decoder_input_ids = []
        input_lengths = []
        decoder_input_lengths = []
        prefix_offsets = []
        read_offsets = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        max_tokens = 0

        # Batch tensors
        attention_mask = None
        decoder_input_ids = None
        decoder_attention_mask = None
        encoder_last_hidden_state = None
        top_n_tokens_tensor = None
        past_key_values = []

        # Used for slicing correctly inside the tensors
        # Equivalent to a cumsum on batch sizes
        start_index = 0

        for i, batch in enumerate(batches):
            # Extend all list attributes
            requests.extend(batch.requests)
            all_decoder_input_ids.extend(batch.all_decoder_input_ids)
            input_lengths.extend(batch.input_lengths)
            decoder_input_lengths.extend(batch.decoder_input_lengths)
            prefix_offsets.extend(batch.prefix_offsets)
            read_offsets.extend(batch.read_offsets)
            next_token_choosers.extend(batch.next_token_choosers)
            stopping_criterias.extend(batch.stopping_criterias)
            top_n_tokens.extend(batch.top_n_tokens)

            if i == 0:
                requests_idx_mapping = batch.requests_idx_mapping
            else:
                # We need to offset the mapping for each batch by the cumulative batch size
                for k, v in batch.requests_idx_mapping.items():
                    requests_idx_mapping[k] = v + start_index

            # Slicing end index for this batch
            end_index = start_index + len(batch)

            # We only concatenate batches that did at least one step
            if batch.encoder_last_hidden_state is None:
                raise ValueError("Batch encoder_last_hidden_state cannot be None")

            # Create padded tensor
            if attention_mask is None:
                attention_mask = batch.attention_mask.new_zeros(
                    (total_batch_size, max_input_length),
                )
            # Copy to correct indices
            attention_mask[start_index:end_index, -batch.max_input_length :] = (
                batch.attention_mask[:, -batch.max_input_length :]
            )

            # Create padded tensor
            if decoder_input_ids is None:
                decoder_input_ids = batch.decoder_input_ids.new_zeros(
                    (total_batch_size, 1),
                )
            # Copy to correct indices
            decoder_input_ids[start_index:end_index] = batch.decoder_input_ids

            # Create padded tensor
            if decoder_attention_mask is None:
                # As decoder_attention_mask might not exist, we use `batch.attention_mask` for device here
                decoder_attention_mask = batch.attention_mask.new_zeros(
                    (total_batch_size, max_decoder_input_length + padding_right_offset),
                )
            # If the decoder mask does not exist yet, all generations started at the same time and we never concatenated
            # this batch. All generations are of length `batch.max_decoder_input_length`.
            left_offset = max_decoder_input_length - batch.max_decoder_input_length
            if batch.decoder_attention_mask is None:
                decoder_attention_mask[
                    start_index:end_index,
                    left_offset:-padding_right_offset,
                ] = 1
            # If it exists, we need to index
            else:
                batch_left_offset = (
                    batch.decoder_attention_mask.shape[1]
                    - batch.max_decoder_input_length
                    - batch.padding_right_offset
                )
                decoder_attention_mask[
                    start_index:end_index,
                    left_offset:-padding_right_offset,
                ] = batch.decoder_attention_mask[
                    :,
                    batch_left_offset : -batch.padding_right_offset,
                ]

            # Create padded tensor
            if encoder_last_hidden_state is None:
                encoder_last_hidden_state = batch.encoder_last_hidden_state.new_zeros(
                    (
                        total_batch_size,
                        max_input_length,
                        batch.encoder_last_hidden_state.shape[-1],
                    ),
                )

            if top_n_tokens_tensor is None:
                top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros(
                    total_batch_size,
                )
            top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor

            # Copy to correct indices
            encoder_last_hidden_state[
                start_index:end_index, -batch.max_input_length :, :
            ] = batch.encoder_last_hidden_state[:, -batch.max_input_length :, :]
            batch.encoder_last_hidden_state = None

            # Ensure that we can update tensors in-place
            if isinstance(batch.past_key_values[0], tuple):
                batch.past_key_values = [
                    [t for t in layer] for layer in batch.past_key_values
                ]

            # Add eventual padding tokens that were added while concatenating
            max_tokens += batch.max_tokens + (
                max_input_length
                - batch.max_input_length
                + max_decoder_input_length
                - batch.max_decoder_input_length
            ) * len(batch)

            start_index = end_index

        # Determine shapes for new past kv tensors
        first_past_kvs = batches[0].past_key_values
        _, num_heads, _, head_dim = first_past_kvs[0][0].shape

        padded_dec_t_shape = (
            total_batch_size,
            num_heads,
            (max_decoder_input_length - 1),
            head_dim,
        )

        padded_enc_t_shape = (
            total_batch_size,
            num_heads,
            max_input_length,
            head_dim,
        )

        # Iterate over attention layers
        for j in range(len(first_past_kvs)):
            past_key_values.append([])

            # Decoder past
            for k in range(0, 2):
                # Initialize tensors
                padded_past_values = first_past_kvs[j][k].new_zeros(padded_dec_t_shape)
                past_key_values[j].append(padded_past_values)

                start_index = 0
                for batch in batches:
                    t = batch.past_key_values[j][k]
                    # Clear reference to the original tensor
                    batch.past_key_values[j][k] = None
                    # Slicing end index for this batch
                    end_index = start_index + len(batch)
                    # We slice the past keys and values to remove the padding from previous batches
                    past_seq_len = batch.max_decoder_input_length - 1
                    padded_past_values[start_index:end_index, :, -past_seq_len:, :] = t[
                        :, :, -past_seq_len:, :
                    ]
                    del t

                    start_index = end_index

            # Encoder past
            for k in range(2, 4):
                # Initialize tensors
                padded_past_values = first_past_kvs[j][k].new_zeros(padded_enc_t_shape)
                past_key_values[j].append(padded_past_values)

                start_index = 0
                for batch in batches:
                    t = batch.past_key_values[j][k]
                    # Clear reference to the original tensor
                    batch.past_key_values[j][k] = None
                    # Slicing end index for this batch
                    end_index = start_index + len(batch)
                    # We slice the past keys and values to remove the padding from previous batches
                    padded_past_values[
                        start_index:end_index, :, -batch.max_input_length :, :
                    ] = t[:, :, -batch.max_input_length :, :]
                    del t

                    start_index = end_index

        return cls(
            batch_id=batches[0].batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=None,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            all_decoder_input_ids=all_decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_last_hidden_state=encoder_last_hidden_state,
            past_key_values=past_key_values,
            input_lengths=input_lengths,
            decoder_input_lengths=decoder_input_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length,
            max_decoder_input_length=max_decoder_input_length,
            padding_right_offset=padding_right_offset,
            max_tokens=max_tokens,
        )

    def __len__(self):
        return len(self.requests)


class Seq2SeqLM(Model):
    def __init__(
        self,
        model_id: str,
        model_class,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        default_dtype=torch.float16,
        trust_remote_code: bool = False,
        config_class=AutoConfig,
        tokenizer_class=AutoTokenizer,
        aliases=None,
    ):
        self.quantize = quantize
        self.process_group, rank, world_size = initialize_torch_distributed()

        device = torch.device("hpu")
        dtype = torch.bfloat16 if dtype is None else dtype

        config = config_class.from_pretrained(
            model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
        config.quantize = quantize
        config.speculator = speculator

        tokenizer = tokenizer_class.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        tokenizer.bos_token_id = config.decoder_start_token_id

        weights_loader = get_loader(
            quantize=quantize, model_id=model_id, revision=revision
        )
        torch.distributed.barrier(group=self.process_group)
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(
            filenames,
            device=device,
            dtype=dtype,
            process_group=self.process_group,
            aliases=aliases,
            weights_loader=weights_loader,
        )
        if config.quantize in ["awq", "gptq"]:
            weights._set_gptq_params(model_id, revision)

        model = model_class(config, weights)

        torch.distributed.barrier(group=self.process_group)
        super().__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=True,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
        )

    @classmethod
    def fallback(
        cls,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        if speculator:
            raise RuntimeError("Speculator decoding is not enabled for AutoModel")

        if torch.cuda.is_available():
            device = torch.device("cuda")
            dtype = torch.float16 if dtype is None else dtype
        else:
            if quantize:
                raise ValueError("quantization is not available on CPU")

            device = torch.device("cpu")
            dtype = torch.float32 if dtype is None else dtype

        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=dtype,
            device_map=(
                "auto"
                if torch.cuda.is_available() and torch.cuda.device_count() > 1
                else None
            ),
            load_in_8bit=quantize == "bitsandbytes",
            trust_remote_code=trust_remote_code,
        )
        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
            model = model.cuda()

        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        tokenizer.bos_token_id = model.config.decoder_start_token_id

        self = cls.__new__(
            cls,
        )
        super().__init__(
            self,
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=True,
            dtype=dtype,
            device=device,
        )
        self.quantize = quantize
        return self

    @property
    def batch_type(self) -> Type[Seq2SeqLMBatch]:
        return Seq2SeqLMBatch

    def forward(
        self,
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask: Optional,
        encoder_last_hidden_state: Optional,
        past_key_values: Optional = None,
    ) -> Tuple[
        torch.Tensor,
        Optional[torch.Tensor],
        torch.Tensor,
        List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
    ]:
        # Model Forward
        outputs = self.model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_last_hidden_state,
            past_key_values=past_key_values,
            use_cache=True,
        )
        if isinstance(outputs, tuple):
            # Our custom models
            outputs, speculative_logits = outputs
        else:
            # Generic transformers models
            speculative_logits = None
        return (
            outputs.logits,
            speculative_logits,
            outputs.encoder_last_hidden_state,
            outputs.past_key_values,
        )

    @tracer.start_as_current_span("generate_token")
    def generate_token(
        self, batch: Seq2SeqLMBatch
    ) -> Tuple[List[Generation], Optional[Seq2SeqLMBatch], Tuple[int, int]]:
        start = time.time_ns()
        if batch.decoder_attention_mask is not None:
            # slice to the correct shape
            decoder_attention_mask = batch.decoder_attention_mask[
                :, : -batch.padding_right_offset
            ]
        else:
            decoder_attention_mask = None

        # Wrap `encoder_last_hidden_state` because for some reason, Transformers does a `encoder_last_hidden_state[0]`
        # internally...
        if batch.encoder_last_hidden_state is not None:
            encoder_last_hidden_state = [batch.encoder_last_hidden_state]
        else:
            encoder_last_hidden_state = None

        logits, speculative_logits, encoder_last_hidden_state, past = self.forward(
            batch.input_ids,
            batch.attention_mask,
            batch.decoder_input_ids,
            decoder_attention_mask,
            encoder_last_hidden_state,
            batch.past_key_values,
        )

        # Speculation is not active for seq2seq
        accepted_ids = torch.ones_like(batch.decoder_input_ids)[:, 0]
        batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
            batch.top_n_tokens,
            batch.top_n_tokens_tensor,
            torch.log_softmax(logits[:, -1], -1),
            accepted_ids,
        )

        start_decode = time.time_ns()

        # Finished requests
        generations: List[Generation] = []
        stopped = True

        # Zipped iterator
        iterator = zip(
            batch.requests,
            batch.input_lengths,
            batch.prefix_offsets,
            batch.read_offsets,
            batch.decoder_input_lengths,
            logits,
            batch.next_token_choosers,
            batch.stopping_criterias,
            batch.all_decoder_input_ids,
            batch.top_n_tokens,
            batch_top_token_ids,
            batch_top_token_logprobs,
        )

        # For each member of the batch
        for i, (
            request,
            input_length,
            prefix_offset,
            read_offset,
            decoder_input_length,
            logits,
            next_token_chooser,
            stopping_criteria,
            all_decoder_input_ids,
            top_n_tokens,
            top_token_ids,
            top_token_logprobs,
        ) in enumerate(iterator):
            # Select next token
            next_token_id, logprobs = next_token_chooser(
                all_decoder_input_ids.view(1, -1), logits[-1:, :]
            )

            # Append next token to decoder tokens
            all_decoder_input_ids = torch.cat(
                [all_decoder_input_ids, next_token_id.squeeze(1)]
            )
            new_decoder_input_length = decoder_input_length + 1

            # Generated token
            next_token_logprob = logprobs[-1, next_token_id]
            next_token_id_squeezed = next_token_id.squeeze()
            next_token_text, prefix_offset, read_offset = self.decode_token(
                all_decoder_input_ids, prefix_offset, read_offset
            )

            # Evaluate stopping criteria
            stop, reason = stopping_criteria(next_token_id, next_token_text)

            if not stop:
                stopped = False

            # Shard generations
            # All generations will be appended in the rust sharded client
            if i % self.world_size == self.rank:
                if stop:
                    # Slice with decoder_input_length to remove padding
                    # Decode all tokens
                    output_text, _, _ = self.decode_token(
                        all_decoder_input_ids,
                        prefix_offset=len(all_decoder_input_ids)
                        - decoder_input_length
                        - 1,
                        read_offset=len(all_decoder_input_ids) - decoder_input_length,
                        skip_special_tokens=True,
                    )

                    # Get seed
                    if isinstance(next_token_chooser.choice, Sampling):
                        seed = next_token_chooser.choice.seed
                    else:
                        seed = None

                    generated_text = GeneratedText(
                        output_text, stopping_criteria.current_tokens, reason, seed
                    )
                else:
                    generated_text = None

                # Prefill
                if stopping_criteria.current_tokens == 1 and request.prefill_logprobs:
                    prefill_tokens = Tokens(
                        [self.tokenizer.bos_token_id],
                        [float("nan")],
                        [self.tokenizer.bos_token],
                        [False],
                    )
                else:
                    prefill_tokens = None

                if top_n_tokens > 0:
                    all_top_tokens = []
                    for top_token_ids, top_token_logprobs in zip(
                        top_token_ids, top_token_logprobs
                    ):
                        toptoken_texts = self.tokenizer.batch_decode(
                            top_token_ids,
                            clean_up_tokenization_spaces=False,
                            skip_special_tokens=False,
                        )
                        special_toptokens = [
                            token_id in self.all_special_ids
                            for token_id in top_token_ids
                        ]
                        top_tokens = Tokens(
                            top_token_ids,
                            top_token_logprobs,
                            toptoken_texts,
                            special_toptokens,
                        )
                        all_top_tokens.append(top_tokens)
                    top_tokens = all_top_tokens
                else:
                    top_tokens = None

                generation = Generation(
                    request.id,
                    prefill_tokens,
                    Tokens(
                        [next_token_id_squeezed],
                        [next_token_logprob],
                        [next_token_text],
                        [next_token_id_squeezed.item() in self.all_special_ids],
                    ),
                    generated_text,
                    top_tokens,
                )

                generations.append(generation)

            # Update values
            batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar(
                next_token_id_squeezed.item()
            )
            batch.decoder_input_ids[i] = next_token_id
            batch.all_decoder_input_ids[i] = all_decoder_input_ids
            batch.input_lengths[i] = input_length
            batch.decoder_input_lengths[i] = new_decoder_input_length
            batch.prefix_offsets[i] = prefix_offset
            batch.read_offsets[i] = read_offset
            batch.max_input_length = max(batch.max_input_length, input_length)
            batch.max_decoder_input_length = max(
                batch.max_decoder_input_length, new_decoder_input_length
            )

        # We finished all generations in the batch; there is no next batch
        if stopped:
            forward_ns = start_decode - start
            decode_ns = time.time_ns() - start_decode
            return generations, None, (forward_ns, decode_ns)

        # We don't need input_ids after the prefill forward
        batch.input_ids = None
        batch.encoder_last_hidden_state = encoder_last_hidden_state
        batch.past_key_values = past
        # Update decoder_attention_mask as we added a new token to input_ids
        if batch.decoder_attention_mask is not None:
            batch.decoder_attention_mask[:, -batch.padding_right_offset] = 1
        batch.padding_right_offset -= 1

        forward_ns = start_decode - start
        decode_ns = time.time_ns() - start_decode
        return generations, batch, (forward_ns, decode_ns)


================================================
FILE: backends/gaudi/server/text_generation_server/models/types.py
================================================
import torch

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional

from transformers import PreTrainedTokenizerBase

from text_generation_server.pb import generate_pb2
from text_generation_server.pb.generate_pb2 import FinishReason


class Batch(ABC):
    @abstractmethod
    def to_pb(self) -> generate_pb2.CachedBatch:
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "Batch":
        raise NotImplementedError

    @abstractmethod
    def filter(self, request_ids: List[int]) -> "Batch":
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def concatenate(cls, batches: List["Batch"]) -> "Batch":
        raise NotImplementedError

    @abstractmethod
    def __len__(self):
        raise NotImplementedError


@dataclass
class GeneratedText:
    text: str
    generated_tokens: int
    finish_reason: FinishReason
    seed: Optional[int]

    def to_pb(self) -> generate_pb2.GeneratedText:
        return generate_pb2.GeneratedText(
            text=self.text,
            generated_tokens=self.generated_tokens,
            finish_reason=self.finish_reason,
            seed=self.seed,
        )


@dataclass
class Tokens:
    token_ids: List[int]
    logprobs: List[float]
    texts: List[str]
    is_special: List[bool]

    def to_pb(self) -> generate_pb2.Tokens:
        return generate_pb2.Tokens(
            ids=self.token_ids,
            logprobs=self.logprobs,
            texts=self.texts,
            is_special=self.is_special,
        )

    def __len__(self):
        return len(self.token_ids)


@dataclass
class Generation:
    request_id: int
    prefill_tokens: Optional[Tokens]
    tokens: Tokens
    generated_text: Optional[GeneratedText]
    # Optional for now, since it's not yet supported for every model.
    top_tokens: Optional[List[Tokens]]

    def to_pb(self) -> generate_pb2.Generation:
        return generate_pb2.Generation(
            request_id=self.request_id,
            prefill_tokens=(
                self.prefill_tokens.to_pb() if self.prefill_tokens is not None else None
            ),
            tokens=self.tokens.to_pb(),
            generated_text=(
                self.generated_text.to_pb() if self.generated_text is not None else None
            ),
            top_tokens=(
                [top_tokens.to_pb() for top_tokens in self.top_tokens]
                if self.top_tokens is not None
                else None
            ),
        )


================================================
FILE: backends/gaudi/server/text_generation_server/pb/.gitignore
================================================
*.py
*.pyi
*.py-e


================================================
FILE: backends/gaudi/server/text_generation_server/server.py
================================================
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import asyncio
import os
import torch
import time
import signal

from grpc import aio
from loguru import logger

from grpc_reflection.v1alpha import reflection
from pathlib import Path
from typing import List, Optional

from text_generation_server.cache import Cache
from text_generation_server.interceptor import ExceptionInterceptor
from text_generation_server.models import Model, get_model_with_lora_adapters
from text_generation_server.pb import generate_pb2_grpc, generate_pb2
from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor
from text_generation_server.models.globals import set_model_id, ATTENTION
from text_generation_server.models.globals import set_adapter_to_index
from text_generation_server.utils.adapter import AdapterInfo
from text_generation_server.utils.tokens import make_tokenizer_optional
from text_generation_server.utils.prefill_chunking import set_max_prefill_tokens
from text_generation_server.models import VLM_BATCH_TYPES

from text_generation_server.utils.version import (
    is_driver_compatible,
    MIN_TGI_GAUDI_SYNAPSE_VERSION,
)


class SignalHandler:
    KEEP_PROCESSING = True

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        print(f"Exiting gracefully: Signal {signum}")
        self.KEEP_PROCESSING = False


class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
    def __init__(
        self,
        model: Model,
        cache: Cache,
        server_urls: List[str],
    ):
        self.cache = cache
        self.model = model
        # Quantize is resolved during model loading
        self.quantize = model.quantize
        self.server_urls = server_urls
        # For some reason, inference_mode does not work well with GLOO which we use on CPU
        # TODO: The inferecemode set messes up the autograd op dispatch. And results in aten::matmul
        # op not optimized issue. Will investigate further.
        # if model.device.type == "hpu":
        # Force inference mode for the lifetime of TextGenerationService
        # self._inference_mode_raii_guard = torch._C._InferenceMode(True)

    async def Info(self, request, context):
        return self.model.info

    async def Health(self, request, context):
        if self.model.device.type == "hpu":
            torch.zeros((2, 2)).to("hpu")
        return generate_pb2.HealthResponse()

    async def ServiceDiscovery(self, request, context):
        return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)

    async def ClearCache(self, request, context):
        if request.HasField("id"):
            self.cache.delete(request.id)
        else:
            self.cache.clear()
        return generate_pb2.ClearCacheResponse()

    async def FilterBatch(self, request, context):
        batch = self.cache.pop(request.batch_id)
        if batch is None:
            raise ValueError(f"Batch ID {request.batch_id} not found in cache.")
        filtered_batch = batch.filter(request.request_ids)
        self.cache.set(filtered_batch)

        return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())

    async def Warmup(self, request, context):
        if ATTENTION == "paged":
            set_max_prefill_tokens(request.max_prefill_tokens)
            if (
                self.model.batch_type in VLM_BATCH_TYPES
            ):  # Hack, i would rather use kwargs in the `from_pb` call
                batch = self.model.batch_type.from_pb_processor(
                    request.batch,
                    self.model.tokenizer,
                    self.model.processor,
                    self.model.model.config,
                    self.model.dtype,
                    self.model.device,
                )
            else:
                batch = self.model.batch_type.from_pb(
                    request.batch,
                    self.model.tokenizer,
                    self.model.dtype,
                    self.model.device,
                )

            # Override default values with None for clearer semantics.
            max_input_tokens = (
                request.max_input_tokens
                if request.HasField("max_input_tokens")
                else None
            )
            max_total_tokens = (
                request.max_total_tokens
                if request.HasField("max_total_tokens")
                else None
            )
            max_supported_total_tokens, max_input_tokens, max_total_tokens = (
                self.model.warmup(batch, max_input_tokens, max_total_tokens)
            )
        else:
            max_supported_total_tokens, max_input_tokens, max_total_tokens = (
                self.model.warmup(request)
            )

            # W/A for the skip tokenizer path
            # We need to call make_tokenizer_optional after the warmup,
            # because router is not aware of that feature
            make_tokenizer_optional(self.model.tokenizer)

        return generate_pb2.WarmupResponse(
            max_supported_total_tokens=max_supported_total_tokens,
            max_input_tokens=max_input_tokens,
            max_total_tokens=max_total_tokens,
        )

    async def Prefill(self, request, context):
        start = time.time_ns()
        if (
            self.model.batch_type in VLM_BATCH_TYPES
        ):  # Hack, i would rather use kwargs in the `from_pb` call
            batch = self.model.batch_type.from_pb_processor(
                request.batch,
                self.model.tokenizer,
                self.model.processor,
                self.model.model.config,
                self.model.dtype,
                self.model.device,
            )
        else:
            batch = self.model.batch_type.from_pb(
                request.batch, self.model.tokenizer, self.model.dtype, self.model.device
            )

        generations, next_batch, timings = self.model.generate_token([batch])
        self.cache.set(next_batch)

        return generate_pb2.PrefillResponse(
            generations=[generation.to_pb() for generation in generations],
            batch=next_batch.to_pb() if next_batch else None,
            forward_ns=timings[0],
            decode_ns=timings[1],
            total_ns=time.time_ns() - start,
        )

    async def Decode(self, request, context):
        start = time.time_ns()
        if len(request.batches) == 0:
            raise ValueError("Must provide at least one batch")

        batches = []
        for batch_pb in request.batches:
            batch = self.cache.pop(batch_pb.id)
            if batch is None:
                raise ValueError(f"Batch ID {batch_pb.id} not found in cache.")
            batches.append(batch)

        if len(batches) == 0:
            raise ValueError("All batches are empty")

        generations, next_batch, timings = self.model.generate_token(batches)
        self.cache.set(next_batch)

        return generate_pb2.DecodeResponse(
            generations=[generation.to_pb() for generation in generations],
            batch=next_batch.to_pb() if next_batch else None,
            concat_ns=None,
            forward_ns=timings[0],
            decode_ns=timings[1],
            total_ns=time.time_ns() - start,
        )


def serve(
    model_id: str,
    lora_adapters: Optional[List[AdapterInfo]],
    revision: Optional[str],
    sharded: bool,
    quantize: Optional[str],
    speculate: Optional[int],
    dtype: Optional[str],
    kv_cache_dtype: Optional[str],
    trust_remote_code: bool,
    uds_path: Path,
    max_input_tokens: int,
):
    async def serve_inner(
        model_id: str,
        lora_adapters: Optional[List[AdapterInfo]],
        revision: Optional[str],
        sharded: bool = False,
        quantize: Optional[str] = None,
        speculate: Optional[int] = None,
        dtype: Optional[str] = None,
        kv_cache_dtype: Optional[str] = None,
        trust_remote_code: bool = False,
    ):
        if not is_driver_compatible():
            logger.warning(
                f"Current Synapse version is lower than the minimum version supported: {MIN_TGI_GAUDI_SYNAPSE_VERSION}, this could result in failures"
            )

        unix_socket_template = "unix://{}-{}"
        adapter_to_index = {}
        logger.info("Server:server_inner: sharded ={}".format(sharded))

        if sharded:
            rank = int(os.environ["RANK"])
            logger.info("Server:server_inner: rank ={}".format(rank))
            server_urls = [
                unix_socket_template.format(uds_path, rank)
                for rank in range(int(os.environ["WORLD_SIZE"]))
            ]
            local_url = server_urls[int(os.environ["RANK"])]
        else:
            local_url = unix_socket_template.format(uds_path, 0)
            server_urls = [local_url]

        logger.info(
            "Server:server_inner: data type = {}, local_url = {}".format(
                dtype, local_url
            )
        )
        if dtype == "bfloat16" or None:
            data_type = torch.bfloat16
        else:
            data_type = torch.float
        if revision == "None":
            revision = None
        try:
            model = get_model_with_lora_adapters(
                model_id,
                lora_adapters,
                revision,
                sharded,
                quantize,
                speculate,
                data_type,
                kv_cache_dtype,
                trust_remote_code,
                max_input_tokens,
                adapter_to_index,
            )

        except Exception:
            logger.exception("Error when initializing model")
            raise

        set_adapter_to_index(adapter_to_index)
        server = aio.server(
            interceptors=[
                ExceptionInterceptor(),
                UDSOpenTelemetryAioServerInterceptor(),
            ],
            options=[
                # Set the maximum possible message length: i32::MAX
                ("grpc.max_receive_message_length", (1 << 31) - 1)
            ],
        )
        generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
            TextGenerationService(model, Cache(), server_urls), server
        )
        SERVICE_NAMES = (
            generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
            reflection.SERVICE_NAME,
        )
        reflection.enable_server_reflection(SERVICE_NAMES, server)
        server.add_insecure_port(local_url)

        await server.start()

        logger.info("Server started at {}".format(local_url))
        signal_handler = SignalHandler()
        while signal_handler.KEEP_PROCESSING:
            await asyncio.sleep(0.5)

    set_model_id(model_id)
    asyncio.run(
        serve_inner(
            model_id,
            lora_adapters,
            revision,
            sharded,
            quantize,
            speculate,
            dtype,
            kv_cache_dtype,
            trust_remote_code,
        )
    )


================================================
FILE: backends/gaudi/server/text_generation_server/tracing.py
================================================
import grpc

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.grpc._aio_server import (
    OpenTelemetryAioServerInterceptor,
)
from opentelemetry.semconv.trace import SpanAttributes
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
    BatchSpanProcessor,
)


class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor):
    def __init__(self):
        super().__init__(trace.get_tracer(__name__))

    def _start_span(self, handler_call_details, context, set_status_on_exception=False):
        """
        Rewrite _start_span method to support Unix Domain Socket gRPC contexts
        """

        # standard attributes
        attributes = {
            SpanAttributes.RPC_SYSTEM: "grpc",
            SpanAttributes.RPC_GRPC_STATUS_CODE: grpc.StatusCode.OK.value[0],
        }

        # if we have details about the call, split into service and method
        if handler_call_details.method:
            service, method = handler_call_details.method.lstrip("/").split("/", 1)
            attributes.update(
                {
                    SpanAttributes.RPC_METHOD: method,
                    SpanAttributes.RPC_SERVICE: service,
                }
            )

        # add some attributes from the metadata
        metadata = dict(context.invocation_metadata())
        if "user-agent" in metadata:
            attributes["rpc.user_agent"] = metadata["user-agent"]

        # We use gRPC over a UNIX socket
        attributes.update({SpanAttributes.NET_TRANSPORT: "unix"})

        return self._tracer.start_as_current_span(
            name=handler_call_details.method,
            kind=trace.SpanKind.SERVER,
            attributes=attributes,
            set_status_on_exception=set_status_on_exception,
        )


def setup_tracing(otlp_service_name: str, otlp_endpoint: str):
    resource = Resource.create(attributes={"service.name": otlp_service_name})
    span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
    span_processor = BatchSpanProcessor(span_exporter)

    trace.set_tracer_provider(TracerProvider(resource=resource))
    trace.get_tracer_provider().add_span_processor(span_processor)


================================================
FILE: backends/gaudi/server/text_generation_server/utils/__init__.py
================================================
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

from text_generation_server.utils.convert import convert_file, convert_files
from text_generation_server.utils.dist import initialize_torch_distributed
from text_generation_server.utils.weights import Weights
from text_generation_server.utils.peft import download_and_unload_peft
from text_generation_server.utils.hub import (
    weight_files,
    weight_hub_files,
    download_weights,
    EntryNotFoundError,
    LocalEntryNotFoundError,
    RevisionNotFoundError,
)
from text_generation_server.utils.tokens import (
    NextTokenChooser,
    HeterogeneousNextTokenChooser,
    StoppingCriteria,
    StopSequenceCriteria,
    FinishReason,
    Sampling,
    Greedy,
    make_tokenizer_optional,
    is_tokenizer_transparent,
    pad_next_token_chooser_parameters,
)

__all__ = [
    "convert_file",
    "convert_files",
    "initialize_torch_distributed",
    "weight_files",
    "weight_hub_files",
    "download_weights",
    "download_and_unload_peft",
    "EntryNotFoundError",
    "HeterogeneousNextTokenChooser",
    "LocalEntryNotFoundError",
    "RevisionNotFoundError",
    "Greedy",
    "NextTokenChooser",
    "Sampling",
    "StoppingCriteria",
    "StopSequenceCriteria",
    "FinishReason",
    "Weights",
    "make_tokenizer_optional",
    "is_tokenizer_transparent",
    "pad_next_token_chooser_parameters",
]


================================================
FILE: backends/gaudi/server/text_generation_server/utils/adapter.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/utils/adapter.py
# License:  Apache License Version 2.0, January 2004

import warnings
import re
from dataclasses import dataclass
from functools import lru_cache
from typing import TYPE_CHECKING, Set, Tuple, Optional, List

from safetensors.torch import load_file
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer

from text_generation_server.utils.merges.strategies import merge_adapters

from text_generation_server.utils import hub
from text_generation_server.adapters.lora import LoraConfig


if TYPE_CHECKING:
    from text_generation_server.adapters.config import AdapterConfig, ModuleMap


BASE_MODEL_ADAPTER_ID = "__base_model__"


@dataclass
class AdapterInfo:
    id: str
    path: Optional[str]
    revision: Optional[str] = None


@dataclass
class AdapterParameters:
    adapter_info: Tuple[AdapterInfo]
    weights: Tuple[float]
    merge_strategy: NotImplemented
    density: float
    majority_sign_method: NotImplemented


@dataclass
class AdapterSource:
    adapter_id: str
    model_id: str
    revision: str


def parse_lora_adapters(lora_adapters: Optional[str]) -> List[AdapterInfo]:
    if not lora_adapters:
        return []

    adapter_list = []
    for adapter in lora_adapters.split(","):
        adapter = adapter.strip()
        if adapter.count("=") > 1 or adapter.count("@") > 1:
            raise ValueError(f"Invalid LoRA adapter format: {adapter}")
        match = re.match(r"^([^=@]+)(?:=([^@]+))?(?:@(.+))?$", adapter)

        if match:
            adapter_id, path, revision = match.groups()
            adapter_list.append(
                AdapterInfo(id=adapter_id, path=path, revision=revision)
            )
        else:
            raise ValueError(f"Invalid LoRA adapter format: {adapter}")
    return adapter_list


def load_and_merge_adapters(
    model_id: str,
    adapter_parameters: AdapterParameters,
    adapter_index: int,
    weight_names: Tuple[str],
    trust_remote_code: bool = False,
) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
    if len(adapter_parameters.adapter_info) == 1:
        adapter = next(iter(adapter_parameters.adapter_info))
        return load_module_map(
            model_id,
            adapter.revision,
            adapter.id,
            adapter.path,
            weight_names,
            trust_remote_code,
        )

    adapter_params = AdapterParametersContainer(adapter_parameters, adapter_index)
    return _load_and_merge(
        model_id,
        adapter_params,
        weight_names,
        trust_remote_code,
    )


@dataclass
class AdapterParametersContainer:
    adapter_parameters: AdapterParameters
    adapter_index: int

    def __hash__(self) -> int:
        return self.adapter_index


@lru_cache(maxsize=32)
def _load_and_merge(
    model_id: str,
    adapter_params: AdapterParametersContainer,
    weight_names: Tuple[str],
    trust_remote_code: bool = False,
) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
    params = adapter_params.adapter_parameters

    adapters_to_merge = []
    merged_weight_names = set()
    tokenizer = None
    for adapter in params.adapter_info:
        if adapter.id == BASE_MODEL_ADAPTER_ID:
            raise ValueError("Base model adapter cannot be merged.")

        module_map, adapter_config, adapter_weight_names, adapter_tokenizer = (
            load_module_map(
                model_id,
                adapter.revision,
                adapter.id,
                adapter.path,
                weight_names,
                trust_remote_code,
            )
        )

        adapters_to_merge.append((module_map, adapter_config))
        merged_weight_names = merged_weight_names.union(adapter_weight_names)
        if tokenizer is None:
            tokenizer = adapter_tokenizer

    if len(adapters_to_merge) == 0:
        raise ValueError("No adapters to merge.")

    module_map, adapter_config = merge_adapters(adapters_to_merge, params)
    return module_map, adapter_config, merged_weight_names, tokenizer


def check_architectures(
    model_id: str,
    adapter_id: str,
    adapter_config: "AdapterConfig",
    trust_remote_code: bool = False,
):
    try:
        if not adapter_config.base_model_name_or_path:
            # Avoid execution latency caused by the network connection retrying for AutoConfig.from_pretrained(None)
            return

        expected_config = AutoConfig.from_pretrained(
            model_id, trust_remote_code=trust_remote_code
        )
        model_config = AutoConfig.from_pretrained(
            adapter_config.base_model_name_or_path, trust_remote_code=trust_remote_code
        )
    except Exception as e:
        warnings.warn(
            f"Unable to check architecture compatibility for adapter '{adapter_id}' "
            f"against model '{model_id}'. Assuming they are compatible. Error: {e}"
        )
        return

    if model_config.architectures == expected_config.architectures:
        warnings.warn(
            f"Adapter '{adapter_id}' was not trained on base model '{model_id}'. "
            f"If you encounter issues, use --model-id '{adapter_config.base_model_name_or_path}' instead."
        )
    else:
        # TODO(travis): revisit this when we support clasification heads which will not use CausalLM
        raise ValueError(
            f"Adapter '{adapter_id}' is not compatible with model '{model_id}'. "
            f"Architectures differ: {model_config.architectures} != {expected_config.architectures}. "
            f"Use --model-id '{adapter_config.base_model_name_or_path}' instead."
        )


@lru_cache(maxsize=128)
def load_module_map(
    model_id: str,
    revision: str,
    adapter_id: str,
    adapter_path: Optional[str],
    weight_names: Tuple[str],
    trust_remote_code: bool = False,
) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
    adapter_config = LoraConfig.load(adapter_path or adapter_id, None)

    if not adapter_path and adapter_config.base_model_name_or_path != model_id:
        check_architectures(model_id, adapter_id, adapter_config, trust_remote_code)

    adapter_filenames = (
        hub._weight_files_from_dir(adapter_path, extension=".safetensors")
        if adapter_path
        else hub._cached_weight_files(
            adapter_id, revision=revision, extension=".safetensors"
        )
    )

    # throw an error if no adapter weights are found
    if not adapter_filenames:
        raise FileNotFoundError(
            f"No adapter weights found for adapter '{adapter_id}' and revision '{revision}'."
        )

    try:
        adapter_tokenizer = AutoTokenizer.from_pretrained(
            adapter_config.config_path,
            trust_remote_code=trust_remote_code,
        )
    except Exception:
        # Adapter does not have a tokenizer, so fallback to base model tokenizer
        adapter_tokenizer = None

    # load adapter weights from all shards (should have relatively small memory footprint)
    adapter_weights = {}
    for filename in adapter_filenames:
        adapter_weights.update(load_file(filename))

    # map the model weights to the relevant adapter weights (LoRA A and B matrices)
    module_map, adapter_weight_names = adapter_config.map_weights_for_model(
        adapter_weights, weight_names
    )
    return module_map, adapter_config, adapter_weight_names, adapter_tokenizer


def get_attn_weights(i, layer):
    qkv = layer.self_attn.query_key_value
    weights = {}

    for k in ["q", "k", "v"]:
        key = (i, f"{k}_proj")
        value = (f"model.layers.{i}.self_attn.{k}_proj", qkv)
        weights[key] = value

    # also add the qkv_proj weight for the adapter
    weights[(i, "qkv_proj")] = (
        f"model.layers.{i}.self_attn.qkv_proj",
        qkv,
    )

    weights[(i, "o_proj")] = (
        f"model.layers.{i}.self_attn.o_proj",
        layer.self_attn.o_proj,
    )

    return weights


def get_mlp_weights(i, layer):
    weights = {}
    if hasattr(layer, "mlp"):
        mlp = layer.mlp
        if hasattr(mlp, "gate_up_proj"):
            # handle combined gate_up_proj (e.g., for some LLaMA variants)
            weights.update(
                {
                    (i, "gate_proj"): (
                        f"model.layers.{i}.mlp.gate_proj",
                        mlp.gate_up_proj,
                    ),
                    (i, "up_proj"): (f"model.layers.{i}.mlp.up_proj", mlp.gate_up_proj),
                }
            )
        else:
            # handle separate gate_proj, up_proj, and down_proj (e.g., for Gemma)
            if hasattr(mlp, "gate_proj"):
                weights[(i, "gate_proj")] = (
                    f"model.layers.{i}.mlp.gate_proj",
                    mlp.gate_proj,
                )
            if hasattr(mlp, "up_proj"):
                weights[(i, "up_proj")] = (f"model.layers.{i}.mlp.up_proj", mlp.up_proj)

        if hasattr(mlp, "down_proj"):
            weights[(i, "down_proj")] = (
                f"model.layers.{i}.mlp.down_proj",
                mlp.down_proj,
            )

    return weights


# build_layer_weight_lookup creates a mapping of model layers to their corresponding
# weight tensors and paths. It builds a dictionary that maps layer identifiers to tuples
# containing the weight tensor path and the actual layer object. This mapping is needed
# for the lora adapter to know which weights to update when applying the adapter.
def build_layer_weight_lookup(model):
    if hasattr(model, "language_model"):
        m = model.language_model.model
    elif hasattr(model, "text_model"):
        m = model.text_model.model
    else:
        m = model.model

    layer_weights = {}

    for i, layer in enumerate(m.layers):
        attn_weights = get_attn_weights(i, layer)
        mlp_weights = get_mlp_weights(i, layer)

        layer_weights.update(attn_weights)
        layer_weights.update(mlp_weights)

    lm_head = None
    if hasattr(m, "lm_head"):
        lm_head = m.lm_head
    elif hasattr(model, "lm_head"):
        lm_head = model.lm_head

    if lm_head:
        layer_weights[(0, "lm_head")] = ("lm_head", lm_head)

    return layer_weights


================================================
FILE: backends/gaudi/server/text_generation_server/utils/chunks.py
================================================
from typing import Iterable

from loguru import logger

from text_generation_server.pb import generate_pb2


def concat_text_chunks(chunks: Iterable[generate_pb2.InputChunk]) -> str:
    """
    Concatenate text in text chunks. Non-text chunks are dropped.
    """
    text = None
    for chunk in chunks:
        chunk_type = chunk.WhichOneof("chunk")
        if chunk_type == "text":
            if text is None:
                text = chunk.text
            else:
                raise NotImplementedError("Request contained more than one text chunk")
        else:
            # We cannot reject this, e.g. warmup sends an image chunk.
            logger.debug(f"Encountered non-text chunk type {chunk_type}")

    if text is None:
        raise NotImplementedError("Request without a text chunk")

    return text


================================================
FILE: backends/gaudi/server/text_generation_server/utils/convert.py
================================================
import datetime
import torch
import os

from loguru import logger
from pathlib import Path
from safetensors.torch import save_file, load_file, _find_shared_tensors, _is_complete
from typing import List, Dict
from collections import defaultdict


def _remove_duplicate_names(
    state_dict: Dict[str, torch.Tensor],
    *,
    preferred_names: List[str] = None,
    discard_names: List[str] = None,
) -> Dict[str, List[str]]:
    if preferred_names is None:
        preferred_names = []
    preferred_names = set(preferred_names)
    if discard_names is None:
        discard_names = []
    discard_names = set(discard_names)

    shareds = _find_shared_tensors(state_dict)
    to_remove = defaultdict(list)
    for shared in shareds:
        complete_names = set(
            [name for name in shared if _is_complete(state_dict[name])]
        )
        if not complete_names:
            if len(shared) == 1:
                # Force contiguous
                name = list(shared)[0]
                state_dict[name] = state_dict[name].clone()
                complete_names = {name}
            else:
                raise RuntimeError(
                    f"Error while trying to find names to remove to save state dict, but found no suitable name to keep for saving amongst: {shared}. None is covering the entire storage.Refusing to save/load the model since you could be storing much more memory than needed. Please refer to https://huggingface.co/docs/safetensors/torch_shared_tensors for more information. Or open an issue."
                )

        keep_name = sorted(list(complete_names))[0]

        # Mecanism to preferentially select keys to keep
        # coming from the on-disk file to allow
        # loading models saved with a different choice
        # of keep_name
        preferred = complete_names.difference(discard_names)
        if preferred:
            keep_name = sorted(list(preferred))[0]

        if preferred_names:
            preferred = preferred_names.intersection(complete_names)
            if preferred:
                keep_name = sorted(list(preferred))[0]
        for name in sorted(shared):
            if name != keep_name:
                to_remove[keep_name].append(name)
    return to_remove


def convert_file(pt_file: Path, sf_file: Path, discard_names: List[str]):
    """
    Convert a pytorch file to a safetensors file
    This will remove duplicate tensors from the file.

    Unfortunately, this might not respect *transformers* convention.
    Forcing us to check for potentially different keys during load when looking
    for specific tensors (making tensor sharing explicit).
    """
    loaded = torch.load(pt_file, map_location="cpu", weights_only=True)
    if "state_dict" in loaded:
        loaded = loaded["state_dict"]
    to_removes = _remove_duplicate_names(loaded, discard_names=discard_names)

    metadata = {"format": "pt"}
    for kept_name, to_remove_group in to_removes.items():
        for to_remove in to_remove_group:
            if to_remove not in metadata:
                metadata[to_remove] = kept_name
            del loaded[to_remove]
    # Force tensors to be contiguous
    loaded = {k: v.contiguous() for k, v in loaded.items()}

    dirname = os.path.dirname(sf_file)
    os.makedirs(dirname, exist_ok=True)
    save_file(loaded, sf_file, metadata=metadata)
    reloaded = load_file(sf_file)
    for k in loaded:
        pt_tensor = loaded[k]
        sf_tensor = reloaded[k]
        if not torch.equal(pt_tensor, sf_tensor):
            raise RuntimeError(f"The output tensors do not match for key {k}")


def convert_files(pt_files: List[Path], sf_files: List[Path], discard_names: List[str]):
    assert len(pt_files) == len(sf_files)

    N = len(pt_files)
    # We do this instead of using tqdm because we want to parse the logs with the launcher

    for i, (pt_file, sf_file) in enumerate(zip(pt_files, sf_files)):
        # Skip blacklisted files
        if (
            "arguments" in pt_file.name
            or "args" in pt_file.name
            or "training" in pt_file.name
        ):
            continue

        start = datetime.datetime.now()
        convert_file(pt_file, sf_file, discard_names)
        elapsed = datetime.datetime.now() - start
        logger.info(f"Convert: [{i + 1}/{N}] -- Took: {elapsed}")


================================================
FILE: backends/gaudi/server/text_generation_server/utils/debug.py
================================================
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import os
import glob
import time

import habana_frameworks.torch as htorch
import numpy as np

START_TS = None
DBG_TRACE_FILENAME = os.environ.get("DBG_TRACE_FILENAME")
if "GRAPH_VISUALIZATION" in os.environ:
    for f in glob.glob(".graph_dumps/*"):
        os.remove(f)


def to_gb_rounded(mem: float) -> float:
    """
    Rounds and converts to GB.

    Args:
        mem (float): memory in bytes

    Returns:
        float: memory in GB rounded to the second decimal
    """
    return np.round(mem / 1024**3, 2)


def count_hpu_graphs():
    return len(glob.glob(".graph_dumps/*PreGraph*"))


def dbg_trace(tag, txt):
    global START_TS
    if DBG_TRACE_FILENAME is not None and int(os.getenv("RANK", 0)) == 0:
        if START_TS is None:
            START_TS = time.perf_counter()
        time_offset = time.perf_counter() - START_TS
        mem_stats = htorch.hpu.memory.memory_stats()
        mem_used = to_gb_rounded(mem_stats["InUse"])
        max_mem_used = to_gb_rounded(mem_stats["MaxInUse"])
        print(
            f"ts:{time_offset:.3f}s g:{count_hpu_graphs()} mu:{mem_used:.1f}GB "
            f"mmu:{max_mem_used:.1f}GB | {tag} | {txt}",
            flush=True,
            file=open(DBG_TRACE_FILENAME, "a"),
        )


================================================
FILE: backends/gaudi/server/text_generation_server/utils/dist.py
================================================
import os
import torch
from torch.distributed import ProcessGroup
from datetime import timedelta
from loguru import logger

# Tensor Parallelism settings
RANK = int(os.getenv("RANK", "0"))
WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1"))
MEMORY_FRACTION = float(os.getenv("HPU_MEMORY_FRACTION", "0.8"))


class FakeBarrier:
    def wait(self):
        pass


class FakeGroup(ProcessGroup):
    def __init__(self, rank, size):
        self._rank = rank
        self._size = size
        super().__init__(rank, size)

    def allreduce(self, *args, **kwargs):
        return FakeBarrier()

    def allgather(self, inputs, local_tensor, **kwargs):
        assert (
            len(inputs[0]) == len(local_tensor) == 1
        ), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors"
        for input_ in inputs:
            input_[0].data = local_tensor[0].data
        return FakeBarrier()

    def barrier(self, *args, **kwargs):
        return FakeBarrier()

    def size(self):
        return self._size

    def rank(self):
        return self._rank

    def _get_backend_name(self):
        return "fake"


def initialize_torch_distributed():
    if WORLD_SIZE == 1:
        return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE
    else:
        if os.getenv("DEBUG", None) == "1":
            return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE

        if not torch.distributed.is_initialized():
            # Call the init process.
            torch.distributed.init_process_group(
                backend="hccl",
                world_size=WORLD_SIZE,
                rank=RANK,
                timeout=timedelta(seconds=120),
            )
        else:
            logger.warning("torch.distributed is already initialized.")

        return torch.distributed.group.WORLD, RANK, WORLD_SIZE


================================================
FILE: backends/gaudi/server/text_generation_server/utils/hub.py
================================================
import time
import os

from datetime import timedelta
from loguru import logger
from pathlib import Path
from typing import Optional, List

from huggingface_hub import file_download, hf_api, HfApi, hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from huggingface_hub.utils import (
    LocalEntryNotFoundError,
    EntryNotFoundError,
    RevisionNotFoundError,  # noqa # Import here to ease try/except in other part of the lib
)

WEIGHTS_CACHE_OVERRIDE = os.getenv("WEIGHTS_CACHE_OVERRIDE", None)
HF_HUB_OFFLINE = os.environ.get("HF_HUB_OFFLINE", "0").lower() in ["true", "1", "yes"]


def _cached_weight_files(
    model_id: str, revision: Optional[str], extension: str
) -> List[str]:
    """Guess weight files from the cached revision snapshot directory"""
    d = _get_cached_revision_directory(model_id, revision)
    if not d:
        return []
    filenames = _weight_files_from_dir(d, extension)
    return filenames


def _weight_hub_files_from_model_info(
    info: hf_api.ModelInfo, extension: str
) -> List[str]:
    return [
        s.rfilename
        for s in info.siblings
        if s.rfilename.endswith(extension)
        and len(s.rfilename.split("/")) == 1
        and "arguments" not in s.rfilename
        and "args" not in s.rfilename
        and "training" not in s.rfilename
    ]


def _weight_files_from_dir(d: Path, extension: str) -> List[str]:
    # os.walk: do not iterate, just scan for depth 1, not recursively
    # see _weight_hub_files_from_model_info, that's also what is
    # done there with the len(s.rfilename.split("/")) == 1 condition
    root, _, files = next(os.walk(str(d)))
    filenames = [
        os.path.join(root, f)
        for f in files
        if f.endswith(extension)
        and "arguments" not in f
        and "args" not in f
        and "training" not in f
    ]
    return filenames


def _get_cached_revision_directory(
    model_id: str, revision: Optional[str]
) -> Optional[Path]:
    if revision is None:
        revision = "main"

    repo_cache = Path(HUGGINGFACE_HUB_CACHE) / Path(
        file_download.repo_folder_name(repo_id=model_id, repo_type="model")
    )

    if not repo_cache.is_dir():
        # No cache for this model
        return None

    refs_dir = repo_cache / "refs"
    snapshots_dir = repo_cache / "snapshots"

    # Resolve refs (for instance to convert main to the associated commit sha)
    if refs_dir.is_dir():
        revision_file = refs_dir / revision
        if revision_file.exists():
            with revision_file.open() as f:
                revision = f.read()

    # Check if revision folder exists
    if not snapshots_dir.exists():
        return None
    cached_shas = os.listdir(snapshots_dir)
    if revision not in cached_shas:
        # No cache for this revision and we won't try to return a random revision
        return None

    return snapshots_dir / revision


def weight_hub_files(
    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
) -> List[str]:
    """Get the weights filenames on the hub"""
    api = HfApi()

    if HF_HUB_OFFLINE:
        filenames = _cached_weight_files(model_id, revision, extension)
    else:
        # Online case, fetch model info from the Hub
        info = api.model_info(model_id, revision=revision)
        filenames = _weight_hub_files_from_model_info(info, extension)

    if not filenames:
        raise EntryNotFoundError(
            f"No {extension} weights found for model {model_id} and revision {revision}.",
            None,
        )

    return filenames


def try_to_load_from_cache(
    model_id: str, revision: Optional[str], filename: str
) -> Optional[Path]:
    """Try to load a file from the Hugging Face cache"""

    d = _get_cached_revision_directory(model_id, revision)
    if not d:
        return None

    # Check if file exists in cache
    cached_file = d / filename
    return cached_file if cached_file.is_file() else None


def weight_files(
    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
) -> List[Path]:
    """Get the local files"""
    # Local model
    d = Path(model_id)
    if d.exists() and d.is_dir():
        local_files = _weight_files_from_dir(d, extension)
        if not local_files:
            raise FileNotFoundError(
                f"No local weights found in {model_id} with extension {extension}"
            )
        return [Path(f) for f in local_files]

    try:
        filenames = weight_hub_files(model_id, revision, extension)
    except EntryNotFoundError as e:
        if extension != ".safetensors":
            raise e
        # Try to see if there are pytorch weights
        pt_filenames = weight_hub_files(model_id, revision, extension=".bin")
        # Change pytorch extension to safetensors extension
        # It is possible that we have safetensors weights locally even though they are not on the
        # hub if we converted weights locally without pushing them
        filenames = [
            f"{Path(f).stem.lstrip('pytorch_')}.safetensors" for f in pt_filenames
        ]

    if WEIGHTS_CACHE_OVERRIDE is not None:
        files = []
        for filename in filenames:
            p = Path(WEIGHTS_CACHE_OVERRIDE) / filename
            if not p.exists():
                raise FileNotFoundError(
                    f"File {p} not found in {WEIGHTS_CACHE_OVERRIDE}."
                )
            files.append(p)
        return files

    files = []
    for filename in filenames:
        cache_file = try_to_load_from_cache(
            model_id, revision=revision, filename=filename
        )
        if cache_file is None:
            raise LocalEntryNotFoundError(
                f"File {filename} of model {model_id} not found in "
                f"{os.getenv('HUGGINGFACE_HUB_CACHE', 'the local cache')}. "
                f"Please run `text-generation-server download-weights {model_id}` first."
            )
        files.append(cache_file)

    return files


def download_weights(
    filenames: List[str], model_id: str, revision: Optional[str] = None
) -> List[Path]:
    """Download the safetensors files from the hub"""

    def download_file(fname, tries=5, backoff: int = 5):
        local_file = try_to_load_from_cache(model_id, revision, fname)
        if local_file is not None:
            logger.info(f"File {fname} already present in cache.")
            return Path(local_file)

        for idx in range(tries):
            try:
                logger.info(f"Download file: {fname}")
                stime = time.time()
                local_file = hf_hub_download(
                    filename=fname,
                    repo_id=model_id,
                    revision=revision,
                    local_files_only=HF_HUB_OFFLINE,
                )
                logger.info(
                    f"Downloaded {local_file} in {timedelta(seconds=int(time.time() - stime))}."
                )
                return Path(local_file)
            except Exception as e:
                if idx + 1 == tries:
                    raise e
                logger.error(e)
                logger.info(f"Retrying in {backoff} seconds")
                time.sleep(backoff)
                logger.info(f"Retry {idx + 1}/{tries - 1}")

    # We do this instead of using tqdm because we want to parse the logs with the launcher
    start_time = time.time()
    files = []
    for i, filename in enumerate(filenames):
        file = download_file(filename)

        elapsed = timedelta(seconds=int(time.time() - start_time))
        remaining = len(filenames) - (i + 1)
        eta = (elapsed / (i + 1)) * remaining if remaining > 0 else 0

        logger.info(f"Download: [{i + 1}/{len(filenames)}] -- ETA: {eta}")
        files.append(file)

    return files


================================================
FILE: backends/gaudi/server/text_generation_server/utils/import_utils.py
================================================
import torch


def get_hpu_free_memory(device, memory_fraction):
    free_hpu_memory, _ = torch.hpu.mem_get_info()
    return free_hpu_memory


def synchronize_hpu(device):
    torch.hpu.synchronize()


def noop(*args, **kwargs):
    pass


empty_cache = noop
synchronize = synchronize_hpu
get_free_memory = get_hpu_free_memory


================================================
FILE: backends/gaudi/server/text_generation_server/utils/kernels.py
================================================
import importlib

from loguru import logger
from hf_kernels import load_kernel as hf_load_kernel

from text_generation_server.utils.log import log_once


def load_kernel(*, module: str, repo_id: str):
    """
    Load a kernel. First try to load it as the given module (e.g. for
    local development), falling back to a locked Hub kernel.
    """
    try:
        m = importlib.import_module(module)
        log_once(logger.info, f"Using local module for `{module}`")
        return m
    except ModuleNotFoundError:
        return hf_load_kernel(repo_id=repo_id)


__all__ = ["load_kernel"]


================================================
FILE: backends/gaudi/server/text_generation_server/utils/log.py
================================================
from functools import lru_cache
from text_generation_server.utils.dist import RANK


@lru_cache(10)
def log_once(log, msg: str, master=True):
    if master:
        log_master(log, msg)
    else:
        log(msg)


def log_master(log, msg: str):
    if RANK == 0:
        log(msg)


================================================
FILE: backends/gaudi/server/text_generation_server/utils/logits_process.py
================================================
import math
import torch
import habana_frameworks.torch.core as htcore

from loguru import logger
from typing import Dict
from text_generation_server.pb.generate_pb2 import GrammarType

from outlines.fsm.fsm import RegexFSM
from outlines.fsm.json_schema import build_regex_from_schema
from functools import lru_cache
from typing import List, Optional, DefaultDict
import time

from transformers import (
    LogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    TypicalLogitsWarper,
)

mempool = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None


class StaticWarper:
    def __init__(
        self,
        temperature=1.0,
        top_k=None,
        top_p=None,
        typical_p=None,
    ):
        self.warpers = []

        if temperature is not None and temperature != 1.0:
            temperature = float(temperature)
            self.warpers.append(TemperatureLogitsWarper(temperature))
        if top_k is not None and top_k != 0:
            self.warpers.append(TopKLogitsWarper(top_k=top_k))
        if top_p is not None and top_p < 1.0:
            self.warpers.append(TopPLogitsWarper(top_p=top_p))
        if typical_p is not None and typical_p < 1.0:
            self.warpers.append(TypicalLogitsWarper(mass=typical_p))

        self.hpu_graph = None
        self.static_scores = None
        self.static_warped_scores = None
        self.static_next_logprob = None

    def __call__(self, scores):
        if self.hpu_graph is None:
            self.static_scores = scores.clone().contiguous()
            self.static_warped_scores = scores.clone().contiguous()
            self.static_next_logprob = scores.clone().contiguous()
            self.hpu_graph = htcore.hpu.HPUGraph()

            with htcore.hpu.graph(self.hpu_graph):
                local_scores = self.static_scores
                for warper in self.warpers:
                    local_scores = warper(None, local_scores)

                self.static_warped_scores.copy_(local_scores)
                # Compute logprobs
                self.static_next_logprob.copy_(
                    torch.log_softmax(self.static_warped_scores, -1)
                )

        self.static_scores.copy_(scores)
        self.hpu_graph.replay()

        return self.static_warped_scores, self.static_next_logprob


@lru_cache(10)
def static_warper(
    temperature: Optional[float],
    top_k: Optional[int],
    top_p: Optional[float],
    typical_p: Optional[float],
) -> StaticWarper:
    return StaticWarper(
        temperature=temperature, top_k=top_k, top_p=top_p, typical_p=typical_p
    )


class HeterogeneousRepetitionPenaltyLogitsProcessor(LogitsProcessor):
    r"""
    [`LogitsProcessor`] enforcing an exponential penalty on repeated sequences.
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        repetition_penalty (`List[float]`):
            The parameter for repetition penalty. 1.0 means no penalty. See [this
            paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    """

    def __init__(self, penalty: List[float], dtype: torch.dtype, device: torch.device):
        self.penalty = penalty
        self.penalty_tensor = torch.tensor(
            penalty, dtype=dtype, device=device
        ).unsqueeze(1)

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        score = torch.gather(scores, 1, input_ids)

        # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
        score = torch.where(
            score < 0, score * self.penalty_tensor, score / self.penalty_tensor
        )

        scores.scatter_(1, input_ids, score)
        return scores

    def filter(self, indices):
        self.penalty = [self.penalty[i] for i in indices]
        if any([x != 1.0 for x in self.penalty]):
            self.penalty_tensor = self.penalty_tensor[indices]
            return self
        return None


class FrequencyPenaltyLogitsProcessor(LogitsProcessor):
    r"""
    Frequency penalty as defined by OpenAI

    Args:
        penalty (`float`):
            The parameter for frequency penalty. 0.0 means no penalty.
    """

    def __init__(self, penalty: float):
        self.penalty = penalty

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        score = torch.gather(scores, 1, input_ids)
        # if score < 0 then penalty has to be multiplied to reduce the previous token probability
        score = -torch.where(score < 0, score * self.penalty, score / self.penalty)
        # set score to 0 where input_ids is a padding token
        score *= input_ids.ne(0)

        return scores.scatter_add_(1, input_ids, score)


class HeterogeneousFrequencyPenaltyLogitsProcessor(LogitsProcessor):
    r"""
    Frequency penalty as defined by OpenAI in
    https://platform.openai.com/docs/guides/text-generation/parameter-details

    Args:
        frequency_penalty (`List[float]`):
            The parameter for frequency penalty. 0.0 means no penalty.
    """

    def __init__(self, penalty: List[float], dtype: torch.dtype, device: torch.device):
        self.penalty = penalty
        self.penalty_tensor = torch.tensor(
            penalty, dtype=dtype, device=device
        ).unsqueeze(1)

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        batch_size, input_size = input_ids.size()
        vocab_size = scores.size(1)

        # Calculate the frequency for each token so far
        token_freq = torch.zeros(
            batch_size, vocab_size, dtype=scores.dtype, device=scores.device
        )
        token_freq.scatter_add_(
            1,
            input_ids,
            torch.ones_like(input_ids, dtype=scores.dtype, device=scores.device),
        )
        token_freq /= input_size

        # Apply the frequency penalty to logits
        scores -= token_freq * self.penalty_tensor
        return scores

    def filter(self, indices):
        self.penalty = [self.penalty[i] for i in indices]
        if any([x != 0.0 for x in self.penalty]):
            self.penalty_tensor = self.penalty_tensor[indices]
            return self
        return None


class HeterogeneousTemperatureLogitsWarper:
    r"""
    [`LogitsProcessor`] for temperature (exponential scaling output probability distribution).
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        temperature (`float`):
            The value used to module the logits distribution.
    """

    def __init__(
        self, temperature: List[float], dtype: torch.dtype, device: torch.device
    ):
        self.temperature = temperature
        self.temperature_tensor = torch.tensor(
            temperature, dtype=dtype, device=device
        ).unsqueeze(1)

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        scores.div_(self.temperature_tensor)
        return scores

    def filter(self, indices):
        self.temperature = [self.temperature[i] for i in indices]
        if any([x != 1.0 for x in self.temperature]):
            self.temperature_tensor = self.temperature_tensor[indices]
            return self
        return None


class HeterogeneousTopPLogitsWarper(LogitsProcessor):
    """
    [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        top_p (`float`):
            If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
            higher are kept for generation.
        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
            All filtered values will be set to this float value.
        min_tokens_to_keep (`int`, *optional*, defaults to 1):
            Minimum number of tokens that cannot be filtered.
    """

    def __init__(
        self,
        top_p: List[float],
        dtype: torch.dtype,
        device: torch.device,
        filter_value: float = -math.inf,
        min_tokens_to_keep: int = 1,
    ):
        self.top_p = top_p
        self.top_p_opposite = 1 - torch.tensor(
            top_p, dtype=dtype, device=device
        ).unsqueeze(1)
        self.filter_value = filter_value
        self.min_tokens_to_keep = min_tokens_to_keep

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        sorted_logits, sorted_indices = torch.sort(scores, descending=False)
        probs = sorted_logits.softmax(dim=-1)
        # This is way faster for some reason
        for i in range(probs.shape[0]):
            probs[i] = probs[i].cumsum(dim=-1)

        # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
        sorted_indices_to_remove = probs <= self.top_p_opposite
        # Keep at least min_tokens_to_keep
        sorted_indices_to_remove[..., -self.min_tokens_to_keep :] = 0

        # scatter sorted tensors to original indexing
        indices_to_remove = sorted_indices_to_remove.scatter(
            1, sorted_indices, sorted_indices_to_remove
        )
        warped_scores = scores.masked_fill_(indices_to_remove, self.filter_value)

        return warped_scores

    def filter(self, indices):
        self.top_p = [self.top_p[i] for i in indices]
        if any([x < 1.0 for x in self.top_p]):
            self.top_p_opposite = self.top_p_opposite[indices]
            return self
        return None


class HeterogeneousTopKLogitsWarper(LogitsProcessor):
    r"""
    [`LogitsProcessor`] that performs top-k, i.e. restricting to the k highest probability elements.
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        top_k (`int`):
            The number of highest probability vocabulary tokens to keep for top-k-filtering.
        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
            All filtered values will be set to this float value.
        min_tokens_to_keep (`int`, *optional*, defaults to 1):
            Minimum number of tokens that cannot be filtered.
    """

    def __init__(
        self,
        top_k: List[int],
        device: torch.device,
        filter_value: float = -math.inf,
        min_tokens_to_keep: int = 1,
    ):
        self.top_k = top_k
        self.max_top_k = max(top_k)
        # value - 1 as we will use top_k to index and python uses 0 based numbering
        self.top_k_tensor = torch.tensor(
            [max(x - 1, min_tokens_to_keep - 1) for x in top_k],
            dtype=torch.int64,
            device=device,
        ).unsqueeze(1)

        # 0 is a special value that disables top_k warping for this member of the batch
        disabled = [x == 0 for x in top_k]

        if any(disabled):
            self.top_k_disabled_mask = torch.tensor(
                disabled, dtype=torch.bool, device=device
            ).view(-1, 1)
        else:
            self.top_k_disabled_mask = None

        self.filter_value = filter_value

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        # If max_top_k is superior to the vocab, we need to clamp or the warper will fail
        if scores.size(-1) < self.max_top_k:
            max_top_k = scores.size(-1)
            top_k = torch.clamp_max(self.top_k_tensor, max_top_k)
        else:
            max_top_k = self.max_top_k
            top_k = self.top_k_tensor

        # Get the kth score for each member of the batch
        kth_scores = torch.gather(torch.topk(scores, max_top_k)[0], 1, top_k)

        # Mask member of kth_scores that do not want to use top_k warping
        if self.top_k_disabled_mask is not None:
            kth_scores.masked_fill_(self.top_k_disabled_mask, self.filter_value)

        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = scores < kth_scores
        scores.masked_fill_(indices_to_remove, self.filter_value)
        return scores

    def filter(self, indices):
        self.top_k = [self.top_k[i] for i in indices]
        disabled = [x == 0 for x in self.top_k]

        if not all(disabled):
            self.top_k_tensor = self.top_k_tensor[indices]
            self.max_top_k = max(self.top_k)

            if self.top_k_disabled_mask is not None:
                self.top_k_disabled_mask = (
                    self.top_k_disabled_mask[indices] if any(disabled) else None
                )

            return self
        return None


class HeterogeneousTypicalLogitsWarper(LogitsProcessor):
    r"""
    [`LogitsProcessor`] that performs typical decoding. See [Typical Decoding for Natural Language
    Generation](https://arxiv.org/abs/2202.00666) for more information.
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        mass (`float`):
            Value of typical_p between 0 and 1 inclusive, defaults to 0.9.
        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
            All filtered values will be set to this float value.
        min_tokens_to_keep (`int`, *optional*, defaults to 1):
            Minimum number of tokens that cannot be filtered.
    """

    def __init__(
        self,
        mass: List[float],
        dtype: torch.dtype,
        device: torch.device,
        filter_value: float = -math.inf,
        min_tokens_to_keep: int = 1,
    ):
        self.mass = mass
        self.mass_tensor = torch.tensor(mass, dtype=dtype, device=device).unsqueeze(1)

        # 1 is a special value that disables typical_p warping for this member of the batch
        disabled = [x == 1.0 for x in mass]

        if any(disabled):
            self.disabled_mask = torch.tensor(disabled, dtype=torch.bool, device=device)
        else:
            self.disabled_mask = None

        self.filter_value = filter_value
        self.min_tokens_to_keep = min_tokens_to_keep

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        # calculate entropy
        normalized = torch.nn.functional.log_softmax(scores, dim=-1)
        p = torch.exp(normalized)
        ent = -(normalized * p).nansum(-1, keepdim=True)

        # shift and sort
        shifted_scores = torch.abs((-normalized) - ent)
        sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False)
        sorted_logits = scores.gather(-1, sorted_indices)
        probs = sorted_logits.softmax(dim=-1)
        # This is way faster for some reason
        for i in range(probs.shape[0]):
            probs[i] = probs[i].cumsum(dim=-1)

        # Remove tokens with cumulative mass above the threshold
        last_ind = (probs < self.mass_tensor).sum(dim=1)
        last_ind[last_ind < 0] = 0

        if self.disabled_mask is not None:
            last_ind.masked_fill_(self.disabled_mask, scores.shape[-1] - 1)

        sorted_indices_to_remove = sorted_scores > sorted_scores.gather(
            1, last_ind.view(-1, 1)
        )
        if self.min_tokens_to_keep > 1:
            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
        indices_to_remove = sorted_indices_to_remove.scatter(
            1, sorted_indices, sorted_indices_to_remove
        )

        warped_scores = scores.masked_fill_(indices_to_remove, self.filter_value)

        return warped_scores

    def filter(self, indices):
        self.mass = [self.mass[i] for i in indices]
        disabled = [x == 1.0 for x in self.mass]

        if not all(disabled):
            self.mass_tensor = self.mass_tensor[indices]

            if self.disabled_mask is not None:
                self.disabled_mask = (
                    self.disabled_mask[indices] if any(disabled) else None
                )

            return self
        return None


class HeterogeneousProcessorWrapper(LogitsProcessor):
    r"""
    A wrapper for logit warpers or processors without heterogeneous parameter support.
    Args:
        processors (`Dict[int, LogitsProcessor]`):
            A mapping of sample indices to logit warpers or processors, to be run sequentially.
    """

    def __init__(
        self,
        processors: Dict[int, LogitsProcessor],
    ):
        self.processors = processors

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        for i, processor in self.processors.items():
            scores[i : i + 1] = processor(input_ids[i : i + 1], scores[i : i + 1])
        return scores

    def filter(self, indices):
        new_processors = {}
        for i, idx in enumerate(indices):
            if idx in self.processors:
                new_processors[i] = self.processors[idx]

        if new_processors:
            self.processors = new_processors
            return self
        return None


class GrammarLogitProcessor(LogitsProcessor):
    fsm_state: DefaultDict[int, int]
    fsm: RegexFSM

    def __init__(self, tokenizer, device, grammar, grammar_type):
        self.device = device
        self.tokenizer = GrammarLogitProcessor._cached_adapt_tokenizer(tokenizer)
        self.fsm = GrammarLogitProcessor._cached_compile_fsm(
            grammar_type, grammar, self.tokenizer
        )

    def __call__(
        self,
        logits: torch.Tensor,
        fsm_grammar_state: int,
    ):
        if fsm_grammar_state == -1 or self.fsm is None:
            return logits
        allowed_tokens = self.fsm.allowed_token_ids(fsm_grammar_state)
        mask = torch.full_like(logits, -math.inf)
        mask[:, allowed_tokens] = 0
        biased_scores = logits + mask
        return biased_scores

    def advance(self, next_token_id, fsm_grammar_state):
        return GrammarLogitProcessor._advance(
            next_token_id, fsm_grammar_state, self.fsm
        )

    @staticmethod
    def _advance(next_token_id, fsm_grammar_state, fsm):
        if fsm_grammar_state == -1:
            return fsm_grammar_state
        return fsm.next_state(fsm_grammar_state, next_token_id)

    # TODO: move grammar compilation into the router
    @staticmethod
    @lru_cache(maxsize=32, typed=True)
    def _cached_compile_fsm(grammar_type, schema, tokenizer):
        start_time = time.time()
        if grammar_type == GrammarType.GRAMMAR_TYPE_JSON:
            schema = build_regex_from_schema(schema)
        elif grammar_type == GrammarType.GRAMMAR_TYPE_REGEX:
            pass  # schema is already a regex just here for clarity
        fsm = RegexFSM(schema, tokenizer)
        logger.debug(f"Compiled FSM in {time.time() - start_time:.2f}s")
        return fsm

    @staticmethod
    @lru_cache(maxsize=32, typed=True)
    def _cached_adapt_tokenizer(tokenizer):
        """Adapt tokenizer to work with the FSM.

        The API of Outlines tokenizers is slightly different to that of
        `transformers`. In addition we need to handle the missing spaces to
        Llama's tokenizer to be able to compile FSMs for this model.

        """
        start_time = time.time()
        tokenizer.vocabulary = tokenizer.get_vocab()
        tokenizer.special_tokens = set(tokenizer.all_special_tokens)

        def convert_token_to_string(token: str) -> str:
            from transformers.file_utils import SPIECE_UNDERLINE

            string = tokenizer.convert_tokens_to_string([token])

            # A hack to handle missing spaces to HF's Llama tokenizers
            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
                return " " + string

            return string

        tokenizer.convert_token_to_string = convert_token_to_string
        logger.debug(f"Adapted tokenizer in {time.time() - start_time:.2f}s")
        return tokenizer


class HeterogeneousGrammarLogitProcessor(LogitsProcessor):
    def __init__(self, tokenizer, device, grammars, grammar_types):
        self.device = device
        self.tokenizer = GrammarLogitProcessor._cached_adapt_tokenizer(tokenizer)
        self.fsms = []
        for grammar, grammar_type in zip(grammars, grammar_types):
            if len(grammar) == 0:
                self.fsms.append(None)
                continue
            fsm = GrammarLogitProcessor._cached_compile_fsm(
                grammar_type, grammar, self.tokenizer
            )
            self.fsms.append(fsm)

    def __call__(
        self,
        logits: torch.Tensor,
        fsm_grammar_states: List[int],
    ):
        mask = torch.full_like(logits, -math.inf)
        for i in range(logits.shape[0]):
            fsm = self.fsms[i]
            if fsm is None:
                continue
            allowed_tokens = fsm.allowed_token_ids(fsm_grammar_states[i])
            mask[i, allowed_tokens] = 0
            logits[i] += mask[i]
        return logits

    def advance_batch(self, next_token_ids, fsm_grammar_states):
        return [
            GrammarLogitProcessor._advance(
                next_token_ids[i], fsm_grammar_states[i], self.fsms[i]
            )
            for i in range(len(next_token_ids))
        ]

    def advance_at_index(self, next_token_id, fsm_grammar_state, index):
        if self.fsms[index] is None:
            return fsm_grammar_state
        return GrammarLogitProcessor._advance(
            next_token_id, fsm_grammar_state, self.fsms[index]
        )

    def filter(self, indices):
        new_fsms = []
        for i in indices:
            new_fsms.append(self.fsms[i])
        self.fsms = new_fsms
        return self


================================================
FILE: backends/gaudi/server/text_generation_server/utils/merges/strategies.py
================================================
import copy
from abc import ABC
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, List, Tuple, Type, Union
from text_generation_server.utils.merges.utils import (
    calculate_majority_sign_mask,
    disjoint_merge,
    prune,
)
import torch

if TYPE_CHECKING:
    from text_generation_server.adapters.lora import LoraConfig
    from text_generation_server.utils.adapter import ModuleMap


class AdapterParameters:
    def __init__(
        self, adapter_ids, weights, merge_strategy, density, majority_sign_method
    ):
        self.adapter_ids = adapter_ids
        self.weights = weights
        self.merge_strategy = merge_strategy
        self.density = density
        self.majority_sign_method = majority_sign_method


def _apply_weights(
    tensors: Union[torch.Tensor, List[torch.Tensor]], w: torch.Tensor
) -> torch.Tensor:
    if isinstance(tensors, torch.Tensor):
        t = tensors
    else:
        t = torch.stack(tensors, dim=0)

    # element-wise weighting of each task tensor
    # need to unsqueeze weights to match task tensor dimensions
    # for multiplication to apply element-wise
    while len(t.shape) > len(w.shape):
        w = w.unsqueeze(-1)
    return t * w


class MergeStrategy(ABC):
    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        raise NotImplementedError()


class LinearMerge(MergeStrategy):
    def __init__(self, **kwargs):
        pass

    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        weighted_task_tensors = _apply_weights(task_tensors, weights)
        return weighted_task_tensors.sum(dim=0)


class TiesMerge(MergeStrategy):
    def __init__(self, density: float, majority_sign_method: str = "total", **kwargs):
        self.density = density
        self.majority_sign_method = majority_sign_method

    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        # sparsify
        task_tensors = [
            prune(tensor, self.density, method="magnitude") for tensor in task_tensors
        ]
        task_tensors = torch.stack(task_tensors, dim=0)

        # elect sign before applying weights
        majority_sign_mask = calculate_majority_sign_mask(
            task_tensors, method=self.majority_sign_method
        )
        weighted_task_tensors = _apply_weights(task_tensors, weights)

        # disjoint merge
        return disjoint_merge(weighted_task_tensors, majority_sign_mask)


class DareLinearMerge(MergeStrategy):
    def __init__(self, density: float, **kwargs):
        self.density = density

    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        # sparsify
        task_tensors = [
            prune(tensor, self.density, method="random", rescale=True)
            for tensor in task_tensors
        ]
        weighted_task_tensors = _apply_weights(task_tensors, weights)
        return weighted_task_tensors.sum(dim=0)


class DareTiesMerge(MergeStrategy):
    def __init__(self, density: float, majority_sign_method: str = "total", **kwargs):
        self.density = density
        self.majority_sign_method = majority_sign_method

    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        # sparsify
        task_tensors = [
            prune(tensor, self.density, method="random", rescale=True)
            for tensor in task_tensors
        ]
        task_tensors = torch.stack(task_tensors, dim=0)

        # elect sign before applying weights
        majority_sign_mask = calculate_majority_sign_mask(
            task_tensors, method=self.majority_sign_method
        )
        weighted_task_tensors = _apply_weights(task_tensors, weights)

        # disjoint merge
        mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask)
        return mixed_task_tensors


strategy_registry: Dict[str, Type[MergeStrategy]] = {
    "linear": LinearMerge,
    "ties": TiesMerge,
    "dare_linear": DareLinearMerge,
    "dare_ties": DareTiesMerge,
}


def merge_adapters(
    adapters: List[Tuple["ModuleMap", "LoraConfig"]],
    merge_params: AdapterParameters,
) -> Tuple["ModuleMap", "LoraConfig"]:
    # strategy_name = MergeStrategyEnum.Name(merge_params.merge_strategy).lower()
    strategy_name = "linear"

    weights = merge_params.weights
    if not weights:
        weights = torch.ones(len(adapters))
    else:
        weights = torch.tensor(weights)

    merge_config = {
        "density": merge_params.density,
        # "majority_sign_method": MajoritySignMethodEnum.Name(
        #     merge_params.majority_sign_method
        # ).lower(),
        "majority_sign_method": "total",
    }
    merge_strategy = strategy_registry[strategy_name](**merge_config)

    module_maps: Dict[str, Dict[str, Dict[str, List[torch.Tensor]]]] = defaultdict(
        lambda: defaultdict(lambda: defaultdict(list))
    )
    lora_configs = []
    weight_name_to_adapter_idx = defaultdict(list)

    # input is list of (module_map, lora_config) tuples
    # convert into dict[k][param_name] -> list of tensors
    for idx, (module_map, lora_config) in enumerate(adapters):
        for weight_name, data in module_map.items():
            weight_name_to_adapter_idx[weight_name].append(idx)
            for k, (param_data, param_name) in data.items():
                module_maps[weight_name][k][param_name].append(param_data)
        lora_configs.append(lora_config)

    # validate lora configs are compatible
    _validate_lora_configs(lora_configs)

    # merge tensors for each module such that we have a single ModuleMap:
    # dict[k] -> merged tensor
    merged_module_map: "ModuleMap" = defaultdict(dict)
    for weight_name, data in module_maps.items():
        indices = weight_name_to_adapter_idx[weight_name]
        param_weights = weights[indices]
        for k, param_data in data.items():
            for param_name, tensors in param_data.items():
                merged_tensor = merge_strategy.merge(tensors, param_weights)
                merged_module_map[weight_name][k] = (merged_tensor, param_name)

    # merge lora configs
    merged_lora_config = _merge_lora_configs(lora_configs)

    return merged_module_map, merged_lora_config


def _validate_lora_configs(lora_configs: List["LoraConfig"]):
    # check that all configs have the same rank
    ranks = set(lora_config.r for lora_config in lora_configs)
    if len(ranks) > 1:
        raise ValueError(
            f"unable to merge adapters, lora configs have different ranks: {ranks}"
        )

    if all(len(lora_config.target_modules) == 0 for lora_config in lora_configs):
        raise ValueError(
            "unable to merge adapters, lora configs have no target modules"
        )


def _merge_lora_configs(lora_configs: List["LoraConfig"]) -> "LoraConfig":
    merged_lora_config = copy.copy(lora_configs[0])

    # merge target modules as a union operation
    merged_target_modules = sorted(
        set(
            module
            for lora_config in lora_configs
            for module in lora_config.target_modules
        )
    )
    merged_lora_config.target_modules = merged_target_modules

    return merged_lora_config


================================================
FILE: backends/gaudi/server/text_generation_server/utils/merges/utils.py
================================================
# coding=utf-8
# From: https://github.com/huggingface/peft/pull/1364
# Copyright 2024-present the HuggingFace Inc. team.
# Modifications by Predibase, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Literal

import torch


def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> torch.Tensor:
    """
    Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction
    `density`.

    Args:
    tensor (`torch.Tensor`):The tensor to prune.
    density (`float`):The fraction of values to preserve. Should be in [0,1].
    """
    mask = torch.zeros_like(tensor).reshape(-1)
    k = int(density * tensor.reshape(-1).shape[0])
    top_k = torch.topk(tensor.abs().reshape(-1), k=k, largest=True)
    mask[top_k[1]] = 1
    return tensor * mask.reshape(tensor.shape)


def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor:
    """
    Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction
    `density`.

    Args:
    tensor (`torch.Tensor`):The tensor to prune.
    density (`float`):The fraction of values to preserve. Should be in [0,1].
    rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor.
    """
    mask = torch.bernoulli(torch.full_like(input=tensor, fill_value=density))
    pruned_tensor = tensor * mask
    if rescale:
        torch.div(input=pruned_tensor, other=density)
    return pruned_tensor


def prune(
    tensor: torch.Tensor,
    density: float,
    method: Literal["magnitude", "random"],
    rescale: bool = False,
) -> torch.Tensor:
    """
    Prune the values of task tensors based on the `method`.

    Args:
    tensor (`torch.Tensor`):The tensor to prune.
    density (`float`):The fraction of values to preserve. Should be in [0,1].
    method (`str`):The method to use to prune. Should be one of ["magnitude", "random"].
    rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor.
    """
    if density >= 1:
        return tensor
    elif density < 0:
        raise ValueError("Density should be >= 0, got {density}")
    if method == "magnitude":
        return magnitude_based_pruning(tensor, density)
    elif method == "random":
        return random_pruning(tensor, density, rescale=rescale)
    else:
        raise ValueError(f"Unknown method {method}")


def calculate_majority_sign_mask(
    tensor: torch.Tensor, method: Literal["total", "frequency"] = "total"
):
    """
    Get the mask of the majority sign across the task tensors. Task tensors are stacked on dimension 0.

    Args:
    tensor (`torch.Tensor`):The tensor to get the mask from.
    method (`str`):The method to use to get the mask. Should be one of ["total", "frequency"].
    """

    sign = tensor.sign()
    if method == "total":
        sign_magnitude = (sign * tensor.abs()).sum(dim=0)
    elif method == "frequency":
        sign_magnitude = sign.sum(dim=0)
    else:
        raise RuntimeError(f'Unimplemented mask method "{method}"')
    majority_sign = torch.where(sign_magnitude >= 0, 1, -1)
    return sign == majority_sign


def disjoint_merge(task_tensors, majority_sign_mask):
    mixed_task_tensors = (task_tensors * majority_sign_mask).sum(dim=0)
    num_params_preserved = majority_sign_mask.sum(dim=0)
    return mixed_task_tensors / torch.clamp(num_params_preserved, min=1.0)


================================================
FILE: backends/gaudi/server/text_generation_server/utils/peft.py
================================================
import os
from typing import Union
from loguru import logger
import torch

from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM


def download_and_unload_peft(model_id, revision, trust_remote_code):
    torch_dtype = torch.float16

    logger.info("Trying to load a Peft model. It might take a while without feedback")
    try:
        model = AutoPeftModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
            low_cpu_mem_usage=True,
        )
    except Exception:
        model = AutoPeftModelForSeq2SeqLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
            low_cpu_mem_usage=True,
        )
    logger.info("Peft model detected.")
    logger.info("Merging the lora weights.")

    base_model_id = model.peft_config["default"].base_model_name_or_path

    model = model.merge_and_unload()

    os.makedirs(model_id, exist_ok=True)
    cache_dir = model_id
    logger.info(f"Saving the newly created merged model to {cache_dir}")
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_id, trust_remote_code=trust_remote_code
    )
    model.save_pretrained(cache_dir, safe_serialization=True)
    model.config.save_pretrained(cache_dir)
    tokenizer.save_pretrained(cache_dir)


def download_peft(
    model_id: Union[str, os.PathLike], revision: str, trust_remote_code: bool
):
    torch_dtype = torch.float16
    try:
        _model = AutoPeftModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
            low_cpu_mem_usage=True,
        )
    except Exception:
        _model = AutoPeftModelForSeq2SeqLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
            low_cpu_mem_usage=True,
        )
    logger.info("Peft model downloaded.")


================================================
FILE: backends/gaudi/server/text_generation_server/utils/prefill_chunking.py
================================================
from typing import Optional

SUPPORT_CHUNKING: Optional[bool] = None
MAX_PREFILL_TOKENS: Optional[int] = None


def set_support_chunking(support_chunking: bool):
    global SUPPORT_CHUNKING
    SUPPORT_CHUNKING = support_chunking


def get_support_chunking() -> bool:
    global SUPPORT_CHUNKING
    return SUPPORT_CHUNKING


def set_max_prefill_tokens(max_prefill_tokens: int):
    global MAX_PREFILL_TOKENS
    MAX_PREFILL_TOKENS = max_prefill_tokens


def get_max_prefill_tokens() -> int:
    global MAX_PREFILL_TOKENS
    return MAX_PREFILL_TOKENS


================================================
FILE: backends/gaudi/server/text_generation_server/utils/quantization.py
================================================
import json
import os
from dataclasses import dataclass
from typing import Optional, List

from huggingface_hub import hf_hub_download
from text_generation_server.utils.weights import (
    WeightsLoader,
)


# TODO: Split this config to have a single config type per quant method
@dataclass
class _QuantizerConfig:
    bits: int
    checkpoint_format: Optional[str]
    desc_act: bool
    groupsize: int
    quant_method: str
    sym: bool
    weight_block_size: Optional[List[int]]
    modules_to_not_convert: List[str]


@dataclass
class _FP8QuantizerConfig:
    activation_scale_ub: float


def _get_config_json(model_id: str, revision: Optional[str], filename: str):
    if os.path.exists(
        os.path.join(
            model_id,
        )
    ):
        filename = os.path.join(model_id, filename)
    else:
        filename = hf_hub_download(model_id, filename=filename, revision=revision)
    with open(filename, "r") as f:
        return json.load(f)


# We should probably do this with Pydantic JSON deserialization,
# but for now we'll stay close to the old _set_gptq_params.
def _get_quantizer_config(model_id, revision):
    bits = 4
    groupsize = -1
    quant_method = "gptq"
    checkpoint_format = None
    sym = False
    desc_act = False
    weight_block_size = None
    modules_to_not_convert = []

    filename = "config.json"
    try:
        data = _get_config_json(model_id, revision, filename)
        # FP8 config
        if data["quantization_config"]["quant_method"] == "fbgemm_fp8":
            return _FP8QuantizerConfig(
                activation_scale_ub=data["quantization_config"]["activation_scale_ub"]
            )
        weight_block_size = data["quantization_config"].get("weight_block_size", None)

        if "zero_point" in data["quantization_config"]:
            sym = not data["quantization_config"]["zero_point"]
            quant_method = "awq"
        elif "sym" in data["quantization_config"]:
            sym = data["quantization_config"]["sym"]

        bits = data["quantization_config"]["bits"]
        groupsize = data["quantization_config"]["group_size"]
        # Order is important here, desc_act is missing on some real models
        quant_method = data["quantization_config"]["quant_method"]
        checkpoint_format = data["quantization_config"].get("checkpoint_format")
        desc_act = data["quantization_config"].get("desc_act", False)
        modules_to_not_convert = data["quantization_config"].get(
            "modules_to_not_convert", []
        )
        if modules_to_not_convert is None:
            modules_to_not_convert = []
    except Exception:
        filename = "quantize_config.json"
        try:
            data = _get_config_json(model_id, revision, filename)
            bits = data["bits"]
            groupsize = data["group_size"]

            if "zero_point" in data:
                sym = not data["zero_point"]
                quant_method = "awq"
            elif "sym" in data:
                sym = data["sym"]

            desc_act = data["desc_act"]
            if "version" in data and data["version"] == "GEMM":
                quant_method = "awq"
        except Exception:
            filename = "quant_config.json"
            try:
                data = _get_config_json(model_id, revision, filename)
                bits = data["w_bit"]
                groupsize = data["q_group_size"]
                desc_act = data["desc_act"]
                if "version" in data and data["version"] == "GEMM":
                    quant_method = "awq"
            except Exception:
                pass

    return _QuantizerConfig(
        bits=bits,
        groupsize=groupsize,
        quant_method=quant_method,
        checkpoint_format=checkpoint_format,
        sym=sym,
        desc_act=desc_act,
        weight_block_size=weight_block_size,
        modules_to_not_convert=modules_to_not_convert,
    )


def get_loader(
    quantize: Optional[str], model_id: str, revision: Optional[str]
) -> WeightsLoader:
    if quantize == "compressed-tensors":
        config = _get_config_json(model_id, revision, "config.json")
        from text_generation_server.layers.compressed_tensors import (
            CompressedTensorsLoader,
        )

        return CompressedTensorsLoader(config)
    quantizer_config = _get_quantizer_config(model_id, revision)
    if quantize in {"awq", "gptq"}:
        from text_generation_server.layers.gptq import GPTQWeightsLoader

        # TODO: improve check once we have one config type per quantize value
        if not isinstance(quantizer_config, _QuantizerConfig):
            raise ValueError(
                f"Quantize is set to `{quantize}` but received a `{quantizer_config.__class__.__name__}` config."
            )

        return GPTQWeightsLoader(
            bits=quantizer_config.bits,
            desc_act=quantizer_config.desc_act,
            groupsize=quantizer_config.groupsize,
            quant_method=quantizer_config.quant_method,
            quantize=quantize,
            sym=quantizer_config.sym,
            modules_to_not_convert=quantizer_config.modules_to_not_convert,
        )
    elif quantize == "fp8" or quantize is None:
        from text_generation_server.layers.fp8 import HybridFP8UnquantLoader

        # Since the default for the quantize config is _QuantizerConfig,
        # we need to add this check to not get an attribute error
        activation_scale_ub = None
        weight_block_size = quantizer_config.weight_block_size
        if isinstance(quantizer_config, _FP8QuantizerConfig):
            activation_scale_ub = quantizer_config.activation_scale_ub

        return HybridFP8UnquantLoader(
            activation_scale_ub,
            to_fp8=quantize == "fp8",
            weight_block_size=weight_block_size,
        )
    else:
        raise ValueError(f"Unknown quantization method: {quantize}")


================================================
FILE: backends/gaudi/server/text_generation_server/utils/segments.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/utils/segments.py
# License:  Apache License Version 2.0, January 2004

from typing import List, Tuple, Union

import torch


def find_segments(
    adapter_indices: Union[torch.Tensor, List[int]],
) -> Tuple[List[int], List[int]]:
    segments = [0]
    segment_indices = []

    if isinstance(adapter_indices, torch.Tensor):
        # Calling .item() repeatedly on CUDA tensor is very slow, so we move it to CPU first
        adapter_indices = adapter_indices.cpu().tolist()

    start_index = 0
    for i in range(1, len(adapter_indices)):
        if adapter_indices[i] != adapter_indices[i - 1]:
            segments.append(i)
            segment_indices.append(adapter_indices[i - 1])
            start_index = i

    # Handle the last segment
    if start_index < len(adapter_indices):
        segments.append(len(adapter_indices))
        segment_indices.append(adapter_indices[-1])

    return segments, segment_indices


class SegmentConcatBuilder:
    def __init__(self):
        self.adapter_segment_indices = []
        self.adapter_segment_tensors = []

    def concat(self, adapter_segments: torch.Tensor, segment_indices: List[int]):
        # Update adapter segments
        if self.adapter_segment_tensors:
            # Because we have already processed at least one batch, remove the 0 start index
            # from this batch denoting the beginning of the segment, then offset all segment
            # positions by the value of the last segment in the previous batch to account for
            # the concatenation.
            adapter_segments = (
                adapter_segments[1:] + self.adapter_segment_tensors[-1][-1]
            )

        if (
            self.adapter_segment_indices
            and self.adapter_segment_indices[-1] == segment_indices[0]
        ):
            # If the last segment in the previous batch is the same as the first segment in this batch,
            # then we merge them together into a single segment. In effect, this means removing it from
            # the segment indices of this batch, and extending the segment span by removing the segment
            # end index from the previous batch.
            segment_indices = segment_indices[1:]
            self.adapter_segment_tensors[-1] = self.adapter_segment_tensors[-1][:-1]

        self.adapter_segment_indices.extend(segment_indices)
        self.adapter_segment_tensors.append(adapter_segments)

    def build(self) -> Tuple[torch.Tensor, List[int]]:
        return torch.concat(self.adapter_segment_tensors), self.adapter_segment_indices


================================================
FILE: backends/gaudi/server/text_generation_server/utils/sgmv.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/utils/sgmv.py
# License:  Apache License Version 2.0, January 2004

import os
import warnings
from functools import lru_cache
from typing import List, Tuple

import torch
import torch.nn.functional as F

try:
    import punica_kernels as _kernels

    HAS_SGMV = not bool(os.environ.get("DISABLE_SGMV", ""))
except ImportError:
    warnings.warn("Could not import SGMV kernel from Punica, falling back to loop.")
    _kernels = None
    HAS_SGMV = False


MIN_SGMV_RANK = 8
MIN_RANK_CUSTOM = 16
MAX_RANK_CUSTOM = 128
SGMV_BLOCK_SIZE = 16
BGMV_MAX_RANK = 64


def has_sgmv() -> bool:
    return HAS_SGMV


def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:
    """Pad a tensor to the minimum rank for SGMV and the nearest multiple of the SGMV block size."""
    if not has_sgmv():
        return t

    # tensor parallelism will result in effective rank being divided by world_size,
    # so we need to scale the min rank to offset that effect
    min_rank = MIN_SGMV_RANK * world_size

    # if we're at or below the min rank, pad up to the min rank
    # otherwise, pad to the nearest multiple of the block size
    current_rank = t.size(dim)
    target_rank = (
        min_rank
        if current_rank <= min_rank
        else (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE
    )
    if current_rank == target_rank:
        return t

    pad_size = target_rank - current_rank

    # see complicatd pad syntax here: https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
    pad = [0, 0] * t.dim()
    pad[(t.dim() - dim - 1) * 2 + 1] = pad_size
    pad = tuple(pad)

    return F.pad(t, pad, mode="constant", value=0.0)


def use_cutlass_shrink(lora_rank: int) -> bool:
    return lora_rank < MIN_RANK_CUSTOM


def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:
    if MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM:
        return t.transpose(0, 1)
    return t


# Source: https://github.com/punica-ai/punica/blob/master/src/punica/ops/__init__.py
def add_lora_sgmv_cutlass(
    y: torch.Tensor,
    x: torch.Tensor,
    wa_ptr: torch.Tensor,
    wb_ptr: torch.Tensor,
    s_start: torch.Tensor,
    s_end: torch.Tensor,
    layer_idx: int,
    lora_rank: int,
):
    """
    Semantics:
        y[s[i]:s[i+1]] += x[s[i]:s[i+1]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i])

    Args:
        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
        x: Shape: `[B, H1]`. Input vectors.
        wa_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.\
            Weight matrix shape: `[num_layers, R, H1]`.
        wb_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.\
            Weight matrix shape: `[num_layers, R, H2]`.
        s_start: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices start indices.
        s_end: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices end indices.
        layer_idx: Layer index of the weight matrices.
    """
    if lora_rank < MIN_RANK_CUSTOM or lora_rank > MAX_RANK_CUSTOM:
        # Custom SGMV shrink only supports rank 16, 32, 64, 128
        _add_lora_sgmv_cutlass_legacy(
            y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank
        )
        return

    tmp1 = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=x.device)
    tmp2_size = _kernels.sgmv_cutlass_tmp_size(wa_ptr.size(0))
    tmp2 = torch.empty((tmp2_size,), dtype=torch.uint8, device=x.device)
    v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
    _kernels.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp1, layer_idx)
    _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp2, layer_idx)


def _add_lora_sgmv_cutlass_legacy(
    y: torch.Tensor,
    x: torch.Tensor,
    wa_ptr: torch.Tensor,
    wb_ptr: torch.Tensor,
    s_start: torch.IntTensor,
    s_end: torch.IntTensor,
    layer_idx: int,
    lora_rank: int,
):
    tmp_size = _kernels.sgmv_cutlass_tmp_size(wa_ptr.size(0))
    tmp = torch.empty((tmp_size,), dtype=torch.uint8, device=x.device)
    v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
    _kernels.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
    _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)


@lru_cache(maxsize=1)
def get_tmp_tensor(device: torch.device) -> torch.Tensor:
    return torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device)


@lru_cache(maxsize=32)
def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
    tmp_size = _kernels.sgmv_cutlass_tmp_size(size)
    return torch.empty((tmp_size,), dtype=torch.uint8, device=device)


def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) -> torch.Tensor:
    return torch.empty((size,), dtype=torch.uint8, device=device)


def get_tmp_expand_size(size: int) -> int:
    return _kernels.sgmv_cutlass_tmp_size(size)


def get_tmp_tensors(
    nsegments: int, lora_rank: int, device: torch.device
) -> Tuple[torch.Tensor, torch.Tensor]:
    use_cutlass = use_cutlass_shrink(lora_rank) and has_sgmv()
    has_sgmv_available = has_sgmv()

    if use_cutlass:
        tmp = get_tmp_tensor_for_size(nsegments, device)
        return tmp, tmp
    elif has_sgmv_available:
        return get_tmp_tensor(device), get_tmp_tensor_for_size(nsegments, device)
    else:
        tmp = get_tmp_tensor_for_size(nsegments, device)
        return tmp, tmp


def lora_a_sgmv_cutlass(
    x: torch.Tensor,
    tmp: torch.Tensor,
    wa_ptr: torch.Tensor,
    s_start: torch.IntTensor,
    s_end: torch.IntTensor,
    layer_idx: int,
    lora_rank: int,
) -> torch.Tensor:
    v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
    if MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM:
        _kernels.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
    else:
        _kernels.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
    return v


def lora_b_sgmv_cutlass(
    y: torch.Tensor,
    v: torch.Tensor,
    tmp: torch.Tensor,
    wb_ptr: torch.Tensor,
    s_start: torch.IntTensor,
    s_end: torch.IntTensor,
    layer_idx: int,
):
    _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)


"""
Semantics:
    y[i] += (
        x[i].unsqueeze(0)
        @ wa_T_all[indices[i], layer_idx, :, :].transpose(-1, -2)
        @ wb_T_all[indices[i], layer_idx, :, :].transpose(-1, -2)
        * scale
    ).squeeze(0)

Args:
    y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
    v: Shape: `[B, R]`. Temporary vector.
    x: Shape: `[B, H1]`. Input vectors.
    wa_T_all: Shape: `[None, L, R, H1]`. All of the transposed LoRA A matrices.
    wb_T_all: Shape: `[None, L, H2, R]`. All of the transposed LoRA B matrices.
    indicies: Shape: `[B]`. Indices of the LoRA weights.
    layer_idx: Layer index of LoRA weights.
    scale: Scaling factor.
"""


def add_lora_a_bgmv(
    v: torch.Tensor,
    x: torch.Tensor,
    wa_T_all: torch.Tensor,
    indicies: torch.LongTensor,
    layer_idx: int,
):
    _kernels.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, 1.0)


def add_lora_b_bgmv(
    y: torch.Tensor,
    v: torch.Tensor,
    wb_T_all: torch.Tensor,
    indicies: torch.LongTensor,
    layer_idx: int,
):
    _kernels.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, 1.0)


def segmented_matmul(
    y: torch.Tensor,
    x: torch.Tensor,
    w: List[torch.Tensor],
    b: List[torch.Tensor],
    s_start: torch.IntTensor,
    s_end: torch.IntTensor,
):
    for i in range(len(w)):
        if s_end[i] - s_start[i] <= 0:
            continue

        xi = x[s_start[i] : s_end[i]]
        wi = w[i]
        bi = b[i]
        y[s_start[i] : s_end[i]] = F.linear(xi, wi, bi)


================================================
FILE: backends/gaudi/server/text_generation_server/utils/speculate.py
================================================
SPECULATE = None


def get_speculate() -> int:
    global SPECULATE
    return SPECULATE


def set_speculate(speculate: int):
    global SPECULATE
    SPECULATE = speculate


================================================
FILE: backends/gaudi/server/text_generation_server/utils/tokens.py
================================================
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.

import re
from typing import List, Optional, Tuple, Set, Union

import torch
from text_generation_server.pb import generate_pb2
from text_generation_server.pb.generate_pb2 import FinishReason, GrammarType
from text_generation_server.utils.logits_process import (
    FrequencyPenaltyLogitsProcessor,
    GrammarLogitProcessor,
    HeterogeneousProcessorWrapper,
    HeterogeneousRepetitionPenaltyLogitsProcessor,
    HeterogeneousFrequencyPenaltyLogitsProcessor,
    HeterogeneousTemperatureLogitsWarper,
    HeterogeneousTopKLogitsWarper,
    HeterogeneousTopPLogitsWarper,
    HeterogeneousTypicalLogitsWarper,
    HeterogeneousGrammarLogitProcessor,
    static_warper,
)
from text_generation_server.utils.watermark import WatermarkLogitsProcessor
from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcessor
import os


class NextTokenChooser:
    def __init__(
        self,
        watermark: bool = False,
        temperature: float = 1.0,
        repetition_penalty: float = 1.0,
        frequency_penalty: float = 0.0,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        typical_p: Optional[float] = None,
        do_sample: bool = False,
        seed: int = 0,
        device: str = "cpu",
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        grammar: str = "",
        grammar_type: GrammarType = GrammarType.GRAMMAR_TYPE_NONE,
        fsm_grammar_state: int = 0,
    ):
        self.watermark_processor = (
            WatermarkLogitsProcessor(device=device) if watermark else None
        )
        self.repetition_processor = (
            RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)
            if repetition_penalty and repetition_penalty != 1.0
            else None
        )
        self.frequency_processor = (
            FrequencyPenaltyLogitsProcessor(penalty=frequency_penalty)
            if frequency_penalty and frequency_penalty != 0.0
            else None
        )
        self.grammar_processor = (
            GrammarLogitProcessor(tokenizer, device, grammar, grammar_type)
            if grammar != ""
            else None
        )
        self.tokenizer = tokenizer

        has_warpers = (
            (temperature is not None and temperature != 1.0)
            or (top_k is not None and top_k != 0)
            or (top_p is not None and top_p < 1.0)
            or (typical_p is not None and typical_p < 1.0)
        )
        if has_warpers:
            self.static_warper = static_warper(
                temperature=temperature, top_k=top_k, top_p=top_p, typical_p=typical_p
            )
        else:
            self.static_warper = None

        sampling = do_sample or has_warpers

        self.choice = Sampling(seed, device) if sampling else Greedy()
        self.fsm_grammar_state = fsm_grammar_state
        self.grammar = grammar

    def __call__(self, input_ids, scores):
        if self.watermark_processor is not None:
            scores = self.watermark_processor(input_ids, scores)
        if self.repetition_processor is not None:
            scores = self.repetition_processor(input_ids, scores)
        if self.frequency_processor is not None:
            scores = self.frequency_processor(input_ids, scores)
        if self.grammar_processor is not None:
            scores = self.grammar_processor(scores, self.fsm_grammar_state)

        if self.static_warper is None:
            next_logprob = torch.log_softmax(scores, -1)
        else:
            scores, next_logprob = self.static_warper(scores)

        next_id = self.choice(scores[-1]).view(1, 1)

        return next_id, next_logprob

    def advance_grammar(self, next_id: int):
        if self.grammar_processor is not None:
            self.fsm_grammar_state = self.grammar_processor.advance(
                next_id, self.fsm_grammar_state
            )
        return self

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.NextTokenChooserParameters,
        device: torch.device,
        tokenizer: PreTrainedTokenizerBase,
    ) -> "NextTokenChooser":
        return NextTokenChooser(
            watermark=pb.watermark,
            temperature=pb.temperature,
            repetition_penalty=pb.repetition_penalty,
            frequency_penalty=pb.frequency_penalty,
            top_k=pb.top_k,
            top_p=pb.top_p,
            typical_p=pb.typical_p,
            do_sample=pb.do_sample,
            seed=pb.seed,
            device=device,
            tokenizer=tokenizer,
            grammar=pb.grammar,
            grammar_type=pb.grammar_type,
        )


class StopSequenceCriteria:
    def __init__(self, stop_sequence: str):
        stop_sequence = re.escape(stop_sequence)
        self.regex = re.compile(f"{stop_sequence}$")

    def __call__(self, output: str) -> bool:
        if self.regex.findall(output):
            return True
        return False


class StoppingCriteria:
    def __init__(
        self,
        eos_token_ids: Optional[Union[Set[int], int]],
        stop_sequence_criterias: List[StopSequenceCriteria],
        max_new_tokens: int = 20,
        ignore_eos_token: bool = False,
    ):
        if eos_token_ids is None:
            eos_token_ids = set()
        elif isinstance(eos_token_ids, int):
            eos_token_ids = set([eos_token_ids])
        elif isinstance(eos_token_ids, set):
            eos_token_ids = eos_token_ids
        else:
            raise RuntimeError(
                f"eos_token_ids is of invalid type {type(eos_token_ids)}, expected int, None or set[int]"
            )
        self.eos_token_ids = eos_token_ids
        self.stop_sequence_criterias = stop_sequence_criterias
        self.max_new_tokens = max_new_tokens
        self.current_tokens = 0
        self.current_output = ""

        if os.getenv("TEXT_GENERATION_SERVER_IGNORE_EOS_TOKEN", "false") == "true":
            self.ignore_eos_token = True
        else:
            self.ignore_eos_token = ignore_eos_token

    def __call__(self, last_token: int, last_output: str) -> Tuple[bool, Optional[str]]:
        self.current_tokens += 1
        if self.current_tokens >= self.max_new_tokens:
            return True, FinishReason.FINISH_REASON_LENGTH

        if isinstance(last_token, torch.Tensor):
            last_token = last_token.item()

        if not self.ignore_eos_token and last_token in self.eos_token_ids:
            return True, FinishReason.FINISH_REASON_EOS_TOKEN

        if self.stop_sequence_criterias:
            self.current_output += last_output
            # There is no need to keep an output that is too long
            if len(self.current_output) > 300:
                # Slice to -200 to avoid doing it all the time
                self.current_output = self.current_output[-200:]
            for stop_sequence_criteria in self.stop_sequence_criterias:
                if stop_sequence_criteria(self.current_output):
                    return True, FinishReason.FINISH_REASON_STOP_SEQUENCE

        return False, None

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.StoppingCriteriaParameters,
        tokenizer: PreTrainedTokenizerBase,
    ) -> "StoppingCriteria":
        stop_sequence_criterias = [
            StopSequenceCriteria(sequence) for sequence in pb.stop_sequences
        ]
        # TODO Hack because eos_token_id cannot be what we want.
        eos_token_id = getattr(tokenizer, "_eos_token_ids", tokenizer.eos_token_id)
        return StoppingCriteria(
            eos_token_id,
            stop_sequence_criterias,
            pb.max_new_tokens,
            pb.ignore_eos_token,
        )


def create_n_gram_speculation(
    input_ids: torch.Tensor,
    next_ids: torch.Tensor,
    accepted_ids: torch.Tensor,
    speculate: int,
    verbose: bool,
):
    # Very trivial approach, find first match in the string.
    # This is much less refined than actual n-gram but seems to work
    # relatively OK in grounded mode and is by far much faster with
    # much less worst case complexity as everything happens on device.
    B = accepted_ids.shape[0]
    device = input_ids.device
    seeds = next_ids[accepted_ids.cumsum(dim=-1) - 1]
    indices = (input_ids == seeds.unsqueeze(-1)).max(dim=1).indices + 1
    all_indices = indices.unsqueeze(-1).expand(B, speculate) + torch.arange(
        speculate, device=device
    )
    all_indices = torch.clamp(all_indices, max=input_ids.shape[1] - 1)

    speculative_ids = input_ids.gather(dim=-1, index=all_indices)
    return speculative_ids


class HeterogeneousNextTokenChooser:
    def __init__(
        self,
        dtype: torch.dtype,
        device: torch.device,
        watermark: List[bool],
        temperature: List[float],
        repetition_penalty: List[float],
        frequency_penalty: List[float],
        top_k: List[int],
        top_p: List[float],
        typical_p: List[float],
        do_sample: List[bool],
        seeds: List[int],
        tokenizer: PreTrainedTokenizerBase,
        grammars: List[str],
        grammar_types: List[int],
        fsm_grammar_states: List[int],
        quantization_enabled: bool,
    ):
        warpers = []

        # TODO: enable watermark with FP8 quantization
        self.watermark_processor = (
            HeterogeneousProcessorWrapper(
                {
                    i: WatermarkLogitsProcessor(device=device)
                    for i, do_watermark in enumerate(watermark)
                    if do_watermark
                }
            )
            if any(watermark) and not quantization_enabled
            else None
        )

        self.repetition_processor = (
            HeterogeneousRepetitionPenaltyLogitsProcessor(
                repetition_penalty, dtype, device
            )
            if any([x != 1.0 for x in repetition_penalty])
            else None
        )

        self.frequency_processor = (
            HeterogeneousFrequencyPenaltyLogitsProcessor(
                frequency_penalty, dtype, device
            )
            if any([x != 0.0 for x in frequency_penalty])
            else None
        )

        self.grammar_processor = (
            HeterogeneousGrammarLogitProcessor(
                tokenizer, device, grammars, grammar_types
            )
            if any([grammar != "" for grammar in grammars])
            else None
        )

        if any(x != 1.0 for x in temperature):
            do_sample = [
                sample or x != 1.0 for x, sample in zip(temperature, do_sample)
            ]
            warpers.append(
                HeterogeneousTemperatureLogitsWarper(temperature, dtype, device)
            )

        if any(x != 0 for x in top_k):
            do_sample = [sample or x != 0 for x, sample in zip(top_k, do_sample)]
            warpers.append(HeterogeneousTopKLogitsWarper(top_k, device))

        if any(x < 1.0 for x in top_p):
            do_sample = [sample or x < 1.0 for x, sample in zip(top_p, do_sample)]
            warpers.append(HeterogeneousTopPLogitsWarper(top_p, dtype, device))

        if any(x < 1.0 for x in typical_p):
            do_sample = [sample or x < 1.0 for x, sample in zip(typical_p, do_sample)]
            warpers.append(HeterogeneousTypicalLogitsWarper(typical_p, dtype, device))

        self.warpers = warpers

        if any(do_sample):
            self.choice = HeterogeneousSampling(do_sample, seeds, device)
        else:
            self.choice = Greedy()

        self.seeds = seeds
        self.do_sample = do_sample
        self.dtype = dtype
        self.device = device
        self.tokenizer = tokenizer
        self.fsm_grammar_states = fsm_grammar_states
        self.grammars = grammars
        self.grammar_types = grammar_types

    def __call__(
        self,
        input_ids: torch.Tensor,
        scores: torch.Tensor,
        speculate: int,
        speculated_ids: Optional[torch.Tensor] = None,
        speculative_scores: Optional[torch.Tensor] = None,
        verbose=False,
    ):
        if speculated_ids is not None:
            B = scores.shape[0] // (speculated_ids.shape[1] + 1)
            S = speculated_ids.shape[1] + 1
            scores = scores.view(B, S, -1)
        else:
            B = scores.shape[0]
            S = 1
            scores = scores.view(B, S, -1)

        next_ids = torch.zeros((B, S), device=scores.device, dtype=torch.long)

        for j in range(S):
            _scores = scores[:, j]
            if self.watermark_processor is not None:
                _scores = self.watermark_processor(input_ids, _scores)
            if self.repetition_processor is not None:
                _scores = self.repetition_processor(input_ids, _scores)
            if self.frequency_processor is not None:
                _scores = self.frequency_processor(input_ids, _scores)
            if self.grammar_processor is not None:
                _scores = self.grammar_processor(_scores, self.fsm_grammar_states)
            for warper in self.warpers:
                _scores = warper(input_ids, _scores)
            _next_ids = self.choice(_scores)
            scores[:, j] = _scores
            next_ids[:, j] = _next_ids
        next_ids = next_ids.view(B * S)
        allscores = scores.view(B * S, -1)
        alllogprobs = torch.log_softmax(allscores, -1)

        if speculated_ids is not None:
            accepted_ids = []
            B = next_ids.shape[0] // (speculated_ids.shape[1] + 1)
            S = speculated_ids.shape[1] + 1
            indices = []
            for i in range(B):
                _next_ids = next_ids[i * S : (i + 1) * S]
                _speculated_ids = speculated_ids[i]
                validate_speculative = _next_ids[:-1] == _speculated_ids
                index = i * S
                accepted = 1
                # First is always valid
                indices.append(index)
                for valid in validate_speculative.tolist():
                    if valid:
                        index += 1
                        accepted += 1
                        indices.append(index)
                    else:
                        break
                accepted_ids.append(accepted)

            accepted_ids = torch.tensor(
                accepted_ids, device=input_ids.device, dtype=input_ids.dtype
            )
            next_ids = next_ids[indices]
            logprobs = alllogprobs[indices]
            indices = torch.arange(B, device=input_ids.device) * S
            if speculative_scores is not None:
                speculative_scores = speculative_scores[indices + accepted_ids - 1]
        else:
            accepted_ids = torch.ones_like(next_ids)
            logprobs = alllogprobs

        next_logprobs = torch.gather(logprobs, 1, next_ids.view(-1, 1)).view(-1)

        if speculate > 0:
            if speculative_scores is not None:
                # Medusa provided some scores
                speculative_ids = Greedy()(speculative_scores)
            else:
                # n-gram
                speculative_ids = create_n_gram_speculation(
                    input_ids, next_ids, accepted_ids, speculate, verbose
                )
        else:
            speculative_ids = None

        return next_ids, next_logprobs, alllogprobs, accepted_ids, speculative_ids

    def advance_grammar(self, next_ids: List[int]):
        if self.grammar_processor is not None:
            other_new_states = self.grammar_processor.advance_batch(
                next_ids, self.fsm_grammar_states
            )
            self.fsm_grammar_states = other_new_states
        return self

    def advance_grammar_single(self, grammar_state_index: int, next_id: int):
        if self.grammar_processor is not None:
            self.fsm_grammar_states[grammar_state_index] = (
                self.grammar_processor.advance_at_index(
                    next_id,
                    self.fsm_grammar_states[grammar_state_index],
                    grammar_state_index,
                )
            )
        return self

    def advance_grammar_single_with_past_state(
        self, grammar_state_index: int, next_id: torch.Tensor, past_state: int
    ):
        if self.grammar_processor is not None:
            next_id = next_id.item()
            self.fsm_grammar_states[grammar_state_index] = (
                self.grammar_processor.advance_at_index(
                    next_id,
                    past_state,
                    grammar_state_index,
                )
            )
        return self

    def filter(self, indices):
        if self.watermark_processor is not None:
            self.watermark_processor = self.watermark_processor.filter(indices)

        if self.repetition_processor is not None:
            self.repetition_processor = self.repetition_processor.filter(indices)

        if self.frequency_processor is not None:
            self.frequency_processor = self.frequency_processor.filter(indices)

        if self.grammar_processor is not None:
            self.grammar_processor = self.grammar_processor.filter(indices)

        filtered_warpers = []
        for warper in self.warpers:
            filtered_warper = warper.filter(indices)
            if filtered_warper is not None:
                filtered_warpers.append(filtered_warper)
        self.warpers = filtered_warpers

        self.seeds = [self.seeds[i] for i in indices]
        self.do_sample = [self.do_sample[i] for i in indices]

        new_grammars = []
        new_fsm_grammar_states = []
        new_grammar_types = []
        for i in indices:
            new_grammars.append(self.grammars[i])
            new_fsm_grammar_states.append(self.fsm_grammar_states[i])
            new_grammar_types.append(self.grammar_types[i])

        self.grammars = new_grammars
        self.fsm_grammar_states = new_fsm_grammar_states
        self.grammar_types = new_grammar_types

        if any(self.do_sample):
            self.choice.filter(indices)
        else:
            self.choice = Greedy()

        return self

    @classmethod
    def from_pb(
        cls,
        pb: List[generate_pb2.NextTokenChooserParameters],
        dtype: torch.dtype,
        device: torch.device,
        tokenizer: PreTrainedTokenizerBase,
        fsm_grammar_states: Optional[List[int]] = None,
        quantization_enabled: bool = False,
    ) -> "HeterogeneousNextTokenChooser":
        return HeterogeneousNextTokenChooser(
            watermark=[pb_.watermark for pb_ in pb],
            temperature=[pb_.temperature for pb_ in pb],
            repetition_penalty=[pb_.repetition_penalty for pb_ in pb],
            frequency_penalty=[pb_.frequency_penalty for pb_ in pb],
            top_k=[pb_.top_k for pb_ in pb],
            top_p=[pb_.top_p for pb_ in pb],
            typical_p=[pb_.typical_p for pb_ in pb],
            do_sample=[pb_.do_sample for pb_ in pb],
            seeds=[pb_.seed for pb_ in pb],
            device=device,
            dtype=dtype,
            tokenizer=tokenizer,
            grammars=[pb_.grammar for pb_ in pb],
            grammar_types=[pb_.grammar_type for pb_ in pb],
            fsm_grammar_states=(
                fsm_grammar_states if fsm_grammar_states else [0] * len(pb)
            ),
            quantization_enabled=quantization_enabled,
        )


def pad_next_token_chooser_parameters(
    parameters: List[generate_pb2.NextTokenChooserParameters],
    expected_size: int,
) -> List[generate_pb2.NextTokenChooserParameters]:
    # disable all logits processors to minimize padding overhead
    empty_parameters = generate_pb2.NextTokenChooserParameters(
        temperature=1.0,
        top_k=0,
        top_p=1.0,
        typical_p=1.0,
        do_sample=False,
        seed=0,
        repetition_penalty=1.0,
        frequency_penalty=0.0,
        watermark=False,
        grammar="",
        grammar_type=0,
    )
    parameters.extend([empty_parameters] * (expected_size - len(parameters)))
    return parameters


class Sampling:
    def __init__(self, seed: int, device: str = "cpu"):
        if device in ["hpu", torch.device("hpu")]:
            import habana_frameworks.torch.hpu.random as htrandom

            self.generator = htrandom.default_generators[0].manual_seed(seed)
        else:
            self.generator = torch.Generator("cpu")
            self.generator.manual_seed(seed)
        self.seed = seed

    def __call__(self, logits):
        probs = torch.nn.functional.softmax(logits, -1)
        # Avoid GPU<->CPU sync done by torch multinomial
        # See: https://github.com/pytorch/pytorch/blob/925a3788ec5c06db62ca732a0e9425a26a00916f/aten/src/ATen/native/Distributions.cpp#L631-L637
        q = torch.empty_like(probs).exponential_(1, generator=self.generator)
        return probs.div_(q).argmax()


class Greedy:
    def __call__(self, logits):
        return logits.argmax(dim=-1)


class HeterogeneousSampling:
    r"""
    Mixed greedy and probabilistic sampling. Compute both and pick the right one for each sample.
    """

    def __init__(self, do_sample: List[bool], seeds: List[int], device: torch.device):
        self.seeds = seeds

        self.greedy_indices = []
        self.sampling_mapping = {}
        for i, (sample, seed) in enumerate(zip(do_sample, seeds)):
            if sample:
                self.sampling_mapping[i] = Sampling(seed, device)
            else:
                self.greedy_indices.append(i)

        self.greedy = Greedy()

    def __call__(self, logits):
        out = torch.zeros(logits.shape[0], dtype=torch.int64, device=logits.device)
        if self.greedy_indices:
            # Computing for all indices is faster than slicing
            torch.argmax(logits, -1, out=out)

        for i, sampling in self.sampling_mapping.items():
            out[i] = sampling(logits[i])
        return out

    def filter(self, indices):
        new_greedy_indices = []
        new_sampling_mapping = {}
        for i, idx in enumerate(indices):
            if idx in self.sampling_mapping:
                new_sampling_mapping[i] = self.sampling_mapping[idx]
            else:
                new_greedy_indices.append(i)

        self.greedy_indices = new_greedy_indices
        self.sampling_mapping = new_sampling_mapping
        return self


def batch_top_tokens(
    top_n_tokens: List[int],
    top_n_tokens_tensor: torch.Tensor,
    logprobs: torch.Tensor,
    accepted_ids: torch.Tensor,
) -> Tuple[List[List[List[int]]], List[List[List[float]]]]:
    """Find the top n most likely tokens for a batch of generations.

    When multiple tokens have equal probabilities and they don't all fit, the
    remaining tokens are also returned.
    """
    max_top_n = max(top_n_tokens)
    # Early exit when top_n_tokens is not used
    if max_top_n == 0:
        return [[[]]] * len(top_n_tokens), [[[]]] * len(top_n_tokens)

    batch_size = accepted_ids.shape[0]
    speculate_size = logprobs.shape[0] // batch_size
    top_n_tokens_tensor = top_n_tokens_tensor.repeat_interleave(speculate_size)
    # Ensure top_n doesn't exceed vocab size
    top_n_tokens = [
        min(tok, logprobs.size(-1))
        for tok in top_n_tokens
        for _ in range(speculate_size)
    ]

    # Parallel kthvalue adapted from https://discuss.pytorch.org/t/how-to-efficiently-get-the-k-th-largest-values-in-parallel/160529/2
    # Sorted topk is faster than torch.sort() since we only need a small subset
    sorted_top_k = torch.topk(logprobs, k=max_top_n, dim=-1, sorted=True).values

    nth_highest = torch.gather(
        sorted_top_k, 1, (top_n_tokens_tensor - 1).clip(min=0).unsqueeze(1)
    )
    nth_highest[nth_highest == -float("inf")] = torch.finfo(logprobs.dtype).min

    # Find the new "fuzzy" top n values
    top_n_indices = (logprobs >= nth_highest).nonzero()
    _, top_n_ishes = torch.unique_consecutive(top_n_indices[:, 0], return_counts=True)

    k = 1 if top_n_ishes.numel() == 0 else top_n_ishes.max()
    # Take a new topk for these new max n values
    top_k = torch.topk(logprobs, k=k, dim=1, sorted=True)

    top_n_ishes = top_n_ishes.tolist()
    top_indices = top_k.indices.tolist()
    top_values = top_k.values.tolist()

    batch_top_token_ids = []
    batch_top_token_logprobs = []
    accepted_ids_list = accepted_ids.tolist()
    for i, n_accepted_ids in enumerate(accepted_ids_list):
        start = speculate_size * i
        stop = speculate_size * (i + 1)
        _top_indices = top_indices[start:stop]
        _top_values = top_values[start:stop]
        _top_n_ishes = top_n_ishes[start:stop]
        _top_n_tokens = top_n_tokens[start:stop]

        _top_indices = _top_indices[:n_accepted_ids]
        _top_values = _top_values[:n_accepted_ids]
        _top_n_ishes = _top_n_ishes[:n_accepted_ids]
        _top_n_tokens = _top_n_tokens[:n_accepted_ids]

        row_top_token_ids = []
        row_top_token_logprobs = []

        for idxs, vals, n, req_n in zip(
            _top_indices, _top_values, _top_n_ishes, _top_n_tokens
        ):
            indices = idxs[:n] if req_n > 0 else []
            values = vals[:n] if req_n > 0 else []

            row_top_token_ids.append(indices)
            row_top_token_logprobs.append(values)

        batch_top_token_ids.append(row_top_token_ids)
        batch_top_token_logprobs.append(row_top_token_logprobs)

    return batch_top_token_ids, batch_top_token_logprobs


def make_tokenizer_optional(tokenizer):
    class _(type(tokenizer)):
        def __call__(
            self,
            text,
            return_tensors,
            padding,
            return_token_type_ids,
            truncation,
            max_length,
        ):
            assert (
                return_tensors == "pt"
            ), "inccorrect input arguments when calling TransparentTokenizer"
            assert (
                padding == "max_length" or padding == "longest"
            ), "inccorrect input arguments when calling TransparentTokenizer"
            assert (
                not return_token_type_ids
            ), "inccorrect input arguments when calling TransparentTokenizer"
            assert (
                truncation
            ), "inccorrect input arguments when calling TransparentTokenizer"

            def str_token_to_int(i):
                if i == "?":
                    return tokenizer.pad_token_id
                else:
                    return int(i)

            all_tokens = [
                [str_token_to_int(i.strip()) for i in inner_text.split(",")]
                for inner_text in text
            ]
            if padding == "longest":
                max_length = max(len(tokens) for tokens in all_tokens)
            return {
                "input_ids": torch.tensor(
                    [
                        [tokenizer.pad_token_id] * (max_length - len(tokens)) + tokens
                        for tokens in all_tokens
                    ]
                ),
                "attention_mask": torch.tensor(
                    [
                        [0] * (max_length - len(tokens)) + [1] * len(tokens)
                        for tokens in all_tokens
                    ]
                ),
            }

        def decode(
            self,
            token_ids,
            skip_special_tokens: bool = False,
            clean_up_tokenization_spaces: bool = None,
            **kwargs,
        ) -> str:
            # I don't think this method is used anywhere and should be removed when doing refactoring
            return ",".join(str(i) for i in to_py_obj(token_ids))  # noqa: F821

    if os.getenv("SKIP_TOKENIZER_IN_TGI", "false").lower() == "true":
        tokenizer.__class__ = _
        tokenizer.is_transparent = True


def is_tokenizer_transparent(tokenizer):
    return hasattr(tokenizer, "is_transparent") and tokenizer.is_transparent is True


================================================
FILE: backends/gaudi/server/text_generation_server/utils/version.py
================================================
from packaging.version import Version
from packaging import version
import subprocess


def get_driver_version():
    """
    Returns the driver version.
    """
    # Enable console printing for `hl-smi` check
    output = subprocess.run(
        "hl-smi",
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env={"ENABLE_CONSOLE": "true"},
    )
    if output.returncode == 0 and output.stdout:
        return version.parse(
            output.stdout.split("\n")[2]
            .replace(" ", "")
            .split(":")[1][:-1]
            .split("-")[0]
        )
    return None


MIN_TGI_GAUDI_SYNAPSE_VERSION = Version("1.19.0")


def is_driver_compatible():
    driver_version = get_driver_version()
    if driver_version is not None:
        if driver_version < MIN_TGI_GAUDI_SYNAPSE_VERSION:
            return False
    return True


================================================
FILE: backends/gaudi/server/text_generation_server/utils/watermark.py
================================================
# coding=utf-8
# Copyright 2023 Authors of "A Watermark for Large Language Models"
# available at https://arxiv.org/abs/2301.10226
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import torch
from transformers import LogitsProcessor
from typing import List, Union

GAMMA = float(os.getenv("WATERMARK_GAMMA", 0.5))
DELTA = float(os.getenv("WATERMARK_DELTA", 2.0))


class WatermarkLogitsProcessor(LogitsProcessor):
    def __init__(
        self,
        gamma: float = GAMMA,
        delta: float = DELTA,
        hash_key: int = 15485863,  # just a large prime number to create a rng seed with sufficient bit width
        device: str = "cpu",
    ):
        # watermarking parameters
        self.gamma = gamma
        self.delta = delta
        self.rng = torch.Generator(device="cpu")
        self.hash_key = hash_key

    def _seed_rng(self, input_ids: Union[List[int], torch.LongTensor]):
        if isinstance(input_ids, list):
            assert (
                len(input_ids) >= 1
            ), "requires at least a 1 token prefix sequence to seed rng"
            prev_token = input_ids[-1]
        else:
            assert len(input_ids) == 1
            input_ids = input_ids[0]
            assert (
                input_ids.shape[-1] >= 1
            ), "requires at least a 1 token prefix sequence to seed rng"
            prev_token = input_ids[-1].item()
        self.rng.manual_seed(self.hash_key * prev_token)

    def _get_greenlist_ids(
        self,
        input_ids: Union[List[int], torch.LongTensor],
        max_value: int,
        device: torch.device,
    ) -> List[int]:
        # seed the rng using the previous tokens/prefix
        self._seed_rng(input_ids)

        greenlist_size = int(max_value * self.gamma)
        vocab_permutation = torch.randperm(max_value, device=device, generator=self.rng)
        greenlist_ids = vocab_permutation[:greenlist_size]
        return greenlist_ids

    @staticmethod
    def _calc_greenlist_mask(
        scores: torch.FloatTensor, greenlist_token_ids
    ) -> torch.BoolTensor:
        green_tokens_mask = torch.zeros_like(scores)
        green_tokens_mask[-1, greenlist_token_ids] = 1
        final_mask = green_tokens_mask.bool()
        return final_mask

    @staticmethod
    def _bias_greenlist_logits(
        scores: torch.Tensor, greenlist_mask: torch.Tensor, greenlist_bias: float
    ) -> torch.Tensor:
        scores[greenlist_mask] = scores[greenlist_mask] + greenlist_bias
        return scores

    def __call__(
        self, input_ids: Union[List[int], torch.LongTensor], scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        greenlist_ids = self._get_greenlist_ids(
            input_ids, scores.shape[-1], scores.device
        )
        green_tokens_mask = self._calc_greenlist_mask(
            scores=scores, greenlist_token_ids=greenlist_ids
        )

        scores = self._bias_greenlist_logits(
            scores=scores, greenlist_mask=green_tokens_mask, greenlist_bias=self.delta
        )
        return scores


================================================
FILE: backends/gaudi/server/text_generation_server/utils/weights.py
================================================
import torch

from abc import ABC, abstractmethod
from contextlib import contextmanager
from pathlib import Path
from typing import Dict, List, Optional, Union, Type
from safetensors import safe_open
from dataclasses import dataclass


class WeightsLoader(ABC):
    """
    Instances of this type implement higher-level weight loading.

    At a low-level, every weight is stored in the Safetensors format.
    The interpretation of weights may be different however, for instance
    could be packed, quantized weights. Loaders are responsible for
    interpreting the raw tensors, sharding tensors in a manner compatible
    with the format, etc.
    """

    @abstractmethod
    def get_weights(self, weights: "Weights", prefix: str):
        """
        Get weights at the given prefix and apply without tensor paralllism.
        """
        ...

    @abstractmethod
    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        """
        Get the packed weights at the given prefix with column-splitting for
        tensor parallelism. This method should be used when multiple different
        weights are packed into a tensor, for instance, query/key/value
        weights or a gate/up projection.

        The `block_sizes` determines the proportions of the packed tensors.
        The columns are split in equally sized blocks when `block_sizes` is an
        `int`, or in blocks proportional given to the sizes. For instance
        `[2, 1, 1]` will divide an input with dimensionality `1024` in
        `[512, 256, 256]`.
        """
        ...

    def get_weights_col(self, weights: "Weights", prefix: str):
        """
        Get weights at the given prefix and apply column-splitting for tensor
        paralllism.
        """
        return weights.get_multi_weights_col([prefix], 0)

    @abstractmethod
    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        """
        Get the weights at the given prefixes, column-split them for tensor
        parallelim, and then concatenate the weights along the given dimension.
        """
        ...

    @abstractmethod
    def get_multi_weights(self, weights: "Weights", prefixes: List[str], dim: int):
        """
        Get the weights at the given prefixes, column-split them for tensor
        parallelim, and then concatenate the weights along the given dimension.
        """
        ...

    @abstractmethod
    def get_weights_row(self, weights: "Weights", prefix: str):
        """
        Get the weights at the given prefix and apply row-splitting for tensor
        parallism.
        """
        ...


class Weight(ABC):
    """Instances of this type implement unquantized/quantized/to-be
    quantized weights."""

    @abstractmethod
    def get_linear(self, bias: torch.Tensor):
        """Create a linear layer from this weight."""
        ...


@dataclass
class UnquantizedWeight(Weight):
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        from text_generation_server.layers.linear import FastLinear

        return FastLinear(self.weight, bias)


class DefaultWeightsLoader(WeightsLoader):
    """Weight loader that loads (unquantized) Torch tensors."""

    def __init__(self, weight_class: Type[UnquantizedWeight]):
        """Create a loader. Weights will be wrapped using the given `weights_class`,
        normally this will be `UnquantizedWeight`, but a quantizer-specific class
        such as `Fp8Weight` can be used to quantize the weights during loading.
        """
        self.weight_class = weight_class

    """
    Loader that uses tensors as-is with the exception of applying sharding
    and/or concatenation.
    """

    def get_weights(self, weights: "Weights", prefix: str):
        return weights.get_tensor(f"{prefix}.weight")

    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        return self.weight_class(
            weights.get_packed_sharded(
                f"{prefix}.weight", dim=0, block_sizes=block_sizes
            ),
        )

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        w = [weights.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
        return self.weight_class(torch.cat(w, dim=dim))

    def get_weights_row(self, weights: "Weights", prefix: str):
        return self.weight_class(
            weights.get_sharded(f"{prefix}.weight", dim=1),
        )

    def get_multi_weights(self, weights: "Weights", prefixes: List[str], dim: int):
        w = [weights.get_tensor(f"{p}.weight") for p in prefixes]
        return self.weight_class(torch.cat(w, dim=dim))


class Weights:
    def __init__(
        self,
        filenames: List[Path],
        device,
        dtype,
        process_group,
        weights_loader: WeightsLoader,
        aliases: Optional[Dict[str, List[str]]] = None,
        prefix: Optional[str] = None,
    ):
        routing = {}
        for filename in filenames:
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename
        if aliases is None:
            aliases = {}
        self.aliases = aliases
        self.routing = routing
        self.device = device
        self.dtype = dtype
        self.process_group = process_group
        self.prefix = prefix
        self.weights_loader = weights_loader
        self._handles = {}

    def _get_handle(self, filename):
        if filename not in self._handles:
            f = safe_open(filename, framework="pytorch")
            self._handles[filename] = f

        return self._handles[filename]

    def get_filename(self, tensor_name: str) -> (str, str):
        names = [tensor_name]
        if self.prefix is not None:
            prefixed = f"{self.prefix}.{tensor_name}"
            names.append(prefixed)
        for name in names:
            filename = self.routing.get(name, None)
            if filename is not None:
                return str(filename), name

            aliases = self.aliases.get(name, [])
            for alias in aliases:
                filename = self.routing.get(alias, None)
                if filename is not None:
                    return str(filename), alias
        raise RuntimeError(f"weight {tensor_name} does not exist")

    def _get_slice(self, tensor_name: str):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        return slice_

    def has_tensor(self, tensor_name: str):
        try:
            self.get_filename(tensor_name)
        except Exception:
            return False
        return True

    def get_shape(self, tensor_name: str):
        return self._get_slice(tensor_name).get_shape()

    def get_tensor(
        self, tensor_name: str, to_device: bool = True, to_dtype: bool = True
    ) -> torch.Tensor:
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        tensor = f.get_tensor(tensor_name)
        # Special case for gptq which shouldn't convert
        # u4 which are disguised as int32. Exl2 uses int16
        # as well. FP8 uses torch.float8_e4m3fn
        if (
            tensor.dtype
            not in [
                torch.float8_e4m3fn,
                torch.int8,
                torch.int16,
                torch.int32,
                torch.int64,
            ]
            and to_dtype
        ):
            tensor = tensor.to(dtype=self.dtype)
        if to_device:
            tensor = tensor.to(device=self.device)
        return tensor

    def get_partial_sharded(
        self, tensor_name: str, dim: int, to_device=True, to_dtype=True
    ):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        world_size = self.process_group.size()
        rank = self.process_group.rank()

        size = slice_.get_shape()[dim]
        block_size = (size + world_size - 1) // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size

        if dim == 0:
            tensor = slice_[start:stop]
        elif dim == 1:
            tensor = slice_[:, start:stop]
        else:
            raise NotImplementedError("Let's make that generic when needed")
        # Special case for gptq which shouldn't convert
        # u4 which are disguised as int32. exl2 uses int16.
        # FP8 uses torch.float8_e4m3fn.
        if (
            tensor.dtype
            not in (torch.float8_e4m3fn, torch.int8, torch.int16, torch.int32)
            and to_dtype
        ):
            tensor = tensor.to(dtype=self.dtype)
        if to_device:
            tensor = tensor.to(device=self.device)
        return tensor

    def get_sharded(self, tensor_name: str, dim: int, to_device=True, to_dtype=True):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        world_size = self.process_group.size()
        size = slice_.get_shape()[dim]
        assert (
            size % world_size == 0
        ), f"The choosen size {size} is not compatible with sharding on {world_size} shards"
        return self.get_partial_sharded(
            tensor_name, dim, to_device=to_device, to_dtype=to_dtype
        )

    def get_packed_sharded(
        self,
        tensor_name: str,
        dim: int,
        block_sizes: Union[int, List[int]],
        to_dtype=True,
    ) -> torch.Tensor:
        """
        Get a shard from a tensor that packs multiple tensors.

        When a tensor packs multiple tensors (such as QKV or an up
        projection + gate projection), sharding with `get_sharded` is not
        safe since it would not split the packed tensors across shards.

        This method shards a tensor, such that the packed tensors are
        split across shards.

        The columns are split in equally sized blocks when blocks is an `int`, or
        in blocks proportional given to the sizes. For instance `[2, 1, 1]` will
        divide an input with dimensionality `1024` in `[512, 256, 256]`. This is
        convenient for e.g. splitting QKV without knowing the storage details of
        quantized weights.
        """
        slice_ = self._get_slice(tensor_name)
        total_size = slice_.get_shape()[dim]
        block_sizes = _blocks_to_block_sizes(total_size=total_size, blocks=block_sizes)

        world_size = self.process_group.size()
        rank = self.process_group.rank()

        tensors_slices = []
        block_offset = 0
        for block_size in block_sizes:
            assert (
                block_size % world_size == 0
            ), f"Prepacked tensor cannot be sharded across {world_size} shards"
            shard_block_size = block_size // world_size
            start = rank * shard_block_size
            stop = (rank + 1) * shard_block_size
            tensors_slices += range(block_offset + start, block_offset + stop)
            block_offset += block_size

        if dim == 0:
            tensor = slice_[tensors_slices, ...]
        elif dim == 1 or dim == -2:
            tensor = slice_[:, tensors_slices, ...]
        elif dim == 2 or dim == -1:
            tensor = slice_[..., tensors_slices]
        else:
            raise ValueError(f"Unsupported dim {dim}, only dim 0, 1 or 2 are supported")

        tensor = tensor.to(device=self.device)

        # Avoid casting quantizer dtypes.
        if (
            tensor.dtype
            not in [
                torch.float8_e4m3fn,
                torch.int8,
                torch.int16,
                torch.int32,
                torch.int64,
            ]
            and to_dtype
        ):
            tensor = tensor.to(dtype=self.dtype)

        return tensor

    def get_weights(self, prefix: str):
        return self.weights_loader.get_weights(self, prefix)

    def get_weights_col_packed_qkv(
        self,
        prefix: str,
        num_heads: int,
        num_key_value_heads: int,
    ):
        return self.get_weights_col_packed(
            prefix, [num_heads, num_key_value_heads, num_key_value_heads]
        )

    def get_weights_col_packed_gate_up(self, prefix: str):
        return self.get_weights_col_packed(prefix, 2)

    def get_weights_col_packed(self, prefix: str, block_sizes: Union[int, List[int]]):
        """
        The columns are split in equally sized blocks when blocks is an `int`, or
        in blocks proportional given to the sizes. For instance `[2, 1, 1]` will
        divide an input with dimensionality `1024` in `[512, 256, 256]`. This is
        convenient for e.g. splitting QKV without knowing the storage details of
        quantized weights.
        """
        return self.weights_loader.get_weights_col_packed(self, prefix, block_sizes)

    def get_weights_col(self, prefix: str):
        return self.weights_loader.get_weights_col(self, prefix)

    def get_multi_weights_col(self, prefixes: List[str], dim: int):
        return self.weights_loader.get_multi_weights_col(self, prefixes, dim)

    def get_tensor_shard(self, var, dim):
        world_size = self.process_group.size()
        rank = self.process_group.rank()
        block_size = var.size()[dim] // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        if dim == 0:
            tensor = var[start:stop]
        elif dim == 1:
            tensor = var[:, start:stop]
        else:
            raise NotImplementedError("Let's make that generic when needed")
        tensor = tensor.to(dtype=self.dtype)
        tensor = tensor.to(device=self.device)
        return tensor

    def get_weights_row(self, prefix: str):
        return self.weights_loader.get_weights_row(self, prefix)

    def get_multi_weights(self, prefixes: List[str], dim: int):
        return self.weights_loader.get_multi_weights(self, prefixes, dim)

    @contextmanager
    def use_loader(self, weights_loader: WeightsLoader):
        """
        This method is a context manager that can be used to use `Weights` with
        a different loader for the duration of the context.
        """

        old_loader = self.weights_loader
        self.weights_loader = weights_loader
        try:
            yield
        finally:
            self.weights_loader = old_loader

    @property
    def loader(self):
        return self.weights_loader


def _blocks_to_block_sizes(total_size: int, blocks: Union[int, List[int]]) -> List[int]:
    """
    Convert block count or proportions to block sizes.

    This function accepts

    - The number of blocks (int), in which case the block size is
      total_size//blocks; or
    - A list of block sizes (List[int]).

    In the latter case, if sum(blocks) < total_size, the ratios between
    the block sizes will be preserved. For instance, if blocks is
    [2, 1, 1] and total_size is 1024, the returned block sizes are
    [512, 256, 256].
    """
    if isinstance(blocks, list):
        total_blocks = sum(blocks)
        assert (
            total_size % total_blocks == 0
        ), f"Cannot split {total_size} in proportional blocks: {blocks}"
        part_size = total_size // total_blocks
        return [part_size * block for block in blocks]
    else:
        assert total_size % blocks == 0, f"Prepacked is not divisible by {blocks}"
        single_size = total_size // blocks
        return [single_size] * blocks


================================================
FILE: backends/gaudi/tgi-entrypoint.sh
================================================
#!/bin/bash

ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'

# Check if --sharded argument is present in the command line arguments
if [[ "$*" == *"--sharded true"* ]]; then
  echo 'setting PT_HPU_ENABLE_LAZY_COLLECTIVES=1 for sharding'
  export PT_HPU_ENABLE_LAZY_COLLECTIVES=1
fi

text-generation-launcher $@


================================================
FILE: backends/grpc-metadata/Cargo.toml
================================================
[package]
name = "grpc-metadata"
version = "0.1.0"
edition = "2021"

[dependencies]
opentelemetry = "^0.20"
tonic = "^0.10"
tracing = "^0.1"
tracing-opentelemetry = "^0.21"


================================================
FILE: backends/grpc-metadata/src/lib.rs
================================================
//! A crate to extract and inject a OpenTelemetry context from and to a gRPC request.
//! Inspired by: https://github.com/open-telemetry/opentelemetry-rust gRPC examples

use opentelemetry::global;
use opentelemetry::propagation::Injector;
use tracing_opentelemetry::OpenTelemetrySpanExt;

/// Inject context in the metadata of a gRPC request.
struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap);

impl Injector for MetadataInjector<'_> {
    /// Set a key and value in the MetadataMap.  Does nothing if the key or value are not valid inputs
    fn set(&mut self, key: &str, value: String) {
        if let Ok(key) = tonic::metadata::MetadataKey::from_bytes(key.as_bytes()) {
            if let Ok(val) = value.parse() {
                self.0.insert(key, val);
            }
        }
    }
}

/// Get a context from the global context and inject the span into a gRPC request's metadata.
fn inject(metadata: &mut tonic::metadata::MetadataMap) {
    global::get_text_map_propagator(|propagator| {
        propagator.inject_context(
            &tracing::Span::current().context(),
            &mut MetadataInjector(metadata),
        )
    })
}

pub trait InjectTelemetryContext {
    fn inject_context(self) -> Self;
}

impl<T> InjectTelemetryContext for tonic::Request<T> {
    fn inject_context(mut self) -> Self {
        inject(self.metadata_mut());
        self
    }
}


================================================
FILE: backends/llamacpp/Cargo.toml
================================================
[package]
name = "text-generation-router-llamacpp"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[build-dependencies]
bindgen = "0.71.1"
pkg-config = "0.3.31"

[dependencies]
async-trait = "0.1.85"
clap = "4.5.27"
hf-hub.workspace = true
num_cpus = "1.16.0"
text-generation-router = { path = "../../router" }
thiserror = "2.0.11"
tokenizers.workspace = true
tokio = { version = "1.43.0", features = ["process"] }
tokio-stream = "0.1.17"
tracing = "0.1.41"


================================================
FILE: backends/llamacpp/README.md
================================================
# Llamacpp backend

If all your dependencies are installed at the system level, running
cargo build should be sufficient. However, if you want to experiment
with different versions of llama.cpp, some additional setup is required.

## Install llama.cpp

    LLAMACPP_PREFIX=$(pwd)/llama.cpp.out

    git clone https://github.com/ggerganov/llama.cpp
    cd llama.cpp
    cmake -B build \
        -DCMAKE_INSTALL_PREFIX="$LLAMACPP_PREFIX" \
        -DLLAMA_BUILD_COMMON=OFF \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=OFF
    cmake --build build --config Release -j
    cmake --install build

## Build TGI

    PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig" cargo build


================================================
FILE: backends/llamacpp/build.rs
================================================
use bindgen::callbacks::{ItemInfo, ParseCallbacks};
use std::env;
use std::path::PathBuf;

#[derive(Debug)]
struct PrefixStripper;

impl ParseCallbacks for PrefixStripper {
    fn generated_name_override(&self, item_info: ItemInfo<'_>) -> Option<String> {
        item_info.name.strip_prefix("llama_").map(str::to_string)
    }
}

fn main() {
    if let Some(cuda_version) = option_env!("CUDA_VERSION") {
        let mut version: Vec<&str> = cuda_version.split('.').collect();
        if version.len() > 2 {
            version.pop();
        }
        let cuda_version = format!("cuda-{}", version.join("."));
        pkg_config::Config::new().probe(&cuda_version).unwrap();
    }
    let llama = pkg_config::Config::new().probe("llama").unwrap();

    for path in &llama.link_paths {
        println!("cargo:rustc-link-arg=-Wl,-rpath,{}", path.display());
    }
    if cfg!(target_os = "linux") {
        println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags");
    }
    let bindings = bindgen::Builder::default()
        .clang_args(
            llama
                .include_paths
                .iter()
                .map(|p| format!("-I{}", p.display())),
        )
        .header_contents("llama_bindings.h", "#include <llama.h>")
        .prepend_enum_name(false)
        .parse_callbacks(Box::new(PrefixStripper))
        .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
        .generate()
        .expect("Unable to generate bindings");

    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
    bindings
        .write_to_file(out_path.join("llamacpp.rs"))
        .expect("Couldn't write bindings!");
}


================================================
FILE: backends/llamacpp/requirements.txt
================================================
transformers==4.49
huggingface-hub==0.28.1
hf-transfer==0.1.9
torch==2.6.0


================================================
FILE: backends/llamacpp/src/backend.rs
================================================
use crate::llamacpp;

use async_trait::async_trait;
use std::ffi::CString;
use std::mem::replace;
use std::str::FromStr;
use std::sync::{mpsc, Once};
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
use text_generation_router::validation::ValidGenerateRequest;
use text_generation_router::{FinishReason, Token};
use thiserror::Error;
use tokenizers::Tokenizer;
use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
use tokio::sync::{oneshot, watch};
use tokio::task::{spawn, spawn_blocking};
use tokio::time::{timeout, Duration, Instant};
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::instrument;
use tracing::{debug, error, info, trace, warn};

#[derive(Debug, Clone, Copy)]
pub enum LlamacppSplitMode {
    GPU(usize),
    Layer,
    Row,
}

impl FromStr for LlamacppSplitMode {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_lowercase().as_str() {
            "layer" => Ok(LlamacppSplitMode::Layer),
            "row" => Ok(LlamacppSplitMode::Row),
            _ => match s.parse::<usize>() {
                Ok(n) => Ok(LlamacppSplitMode::GPU(n)),
                Err(_) => Err("Choose a GPU number or `layer` or `row`".to_string()),
            },
        }
    }
}

#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub enum LlamacppNuma {
    Disabled,
    Distribute,
    Isolate,
    Numactl,
    Mirror,
}

#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub enum LlamacppGGMLType {
    F32,
    F16,
    Q4_0,
    Q4_1,
    Q5_0,
    Q5_1,
    Q8_0,
    Q8_1,
    Q2_K,
    Q3_K,
    Q4_K,
    Q5_K,
    Q6_K,
    Q8_K,
    IQ2_XXS,
    IQ2_XS,
    IQ3_XXS,
    IQ1_S,
    IQ4_NL,
    IQ3_S,
    IQ2_S,
    IQ4_XS,
    I8,
    I16,
    I32,
    I64,
    F64,
    IQ1_M,
    BF16,
    TQ1_0,
    TQ2_0,
}

// TODO: macro
impl LlamacppGGMLType {
    fn to_ggml_type(self) -> llamacpp::ggml_type {
        match self {
            LlamacppGGMLType::F32 => llamacpp::GGML_TYPE_F32,
            LlamacppGGMLType::F16 => llamacpp::GGML_TYPE_F16,
            LlamacppGGMLType::Q4_0 => llamacpp::GGML_TYPE_Q4_0,
            LlamacppGGMLType::Q4_1 => llamacpp::GGML_TYPE_Q4_1,
            LlamacppGGMLType::Q5_0 => llamacpp::GGML_TYPE_Q5_0,
            LlamacppGGMLType::Q5_1 => llamacpp::GGML_TYPE_Q5_1,
            LlamacppGGMLType::Q8_0 => llamacpp::GGML_TYPE_Q8_0,
            LlamacppGGMLType::Q8_1 => llamacpp::GGML_TYPE_Q8_1,
            LlamacppGGMLType::Q2_K => llamacpp::GGML_TYPE_Q2_K,
            LlamacppGGMLType::Q3_K => llamacpp::GGML_TYPE_Q3_K,
            LlamacppGGMLType::Q4_K => llamacpp::GGML_TYPE_Q4_K,
            LlamacppGGMLType::Q5_K => llamacpp::GGML_TYPE_Q5_K,
            LlamacppGGMLType::Q6_K => llamacpp::GGML_TYPE_Q6_K,
            LlamacppGGMLType::Q8_K => llamacpp::GGML_TYPE_Q8_K,
            LlamacppGGMLType::IQ2_XXS => llamacpp::GGML_TYPE_IQ2_XXS,
            LlamacppGGMLType::IQ2_XS => llamacpp::GGML_TYPE_IQ2_XS,
            LlamacppGGMLType::IQ3_XXS => llamacpp::GGML_TYPE_IQ3_XXS,
            LlamacppGGMLType::IQ1_S => llamacpp::GGML_TYPE_IQ1_S,
            LlamacppGGMLType::IQ4_NL => llamacpp::GGML_TYPE_IQ4_NL,
            LlamacppGGMLType::IQ3_S => llamacpp::GGML_TYPE_IQ3_S,
            LlamacppGGMLType::IQ2_S => llamacpp::GGML_TYPE_IQ2_S,
            LlamacppGGMLType::IQ4_XS => llamacpp::GGML_TYPE_IQ4_XS,
            LlamacppGGMLType::I8 => llamacpp::GGML_TYPE_I8,
            LlamacppGGMLType::I16 => llamacpp::GGML_TYPE_I16,
            LlamacppGGMLType::I32 => llamacpp::GGML_TYPE_I32,
            LlamacppGGMLType::I64 => llamacpp::GGML_TYPE_I64,
            LlamacppGGMLType::F64 => llamacpp::GGML_TYPE_F64,
            LlamacppGGMLType::IQ1_M => llamacpp::GGML_TYPE_IQ1_M,
            LlamacppGGMLType::BF16 => llamacpp::GGML_TYPE_BF16,
            LlamacppGGMLType::TQ1_0 => llamacpp::GGML_TYPE_TQ1_0,
            LlamacppGGMLType::TQ2_0 => llamacpp::GGML_TYPE_TQ2_0,
        }
    }
}

pub struct LlamacppConfig {
    pub model_gguf: String,
    pub max_batch_total_tokens: usize,
    pub max_physical_batch_total_tokens: usize,
    pub max_batch_size: usize,
    pub batch_timeout: Duration,
    pub n_threads: usize,
    pub n_threads_batch: usize,
    pub n_gpu_layers: usize,
    pub split_mode: LlamacppSplitMode,
    pub numa: LlamacppNuma,
    pub defrag_threshold: f32,
    pub use_mmap: bool,
    pub use_mlock: bool,
    pub offload_kqv: bool,
    pub flash_attention: bool,
    pub type_k: LlamacppGGMLType,
    pub type_v: LlamacppGGMLType,
}

#[derive(Debug)]
struct LlamacppRequest {
    input_ids: Vec<i32>,
    top_k: i32,
    top_p: f32,
    typical_p: f32,
    min_keep: usize,
    temp: f32,
    seed: u32,
    penalty_last_n: i32,
    penalty_repeat: f32,
    penalty_freq: f32,
    penalty_present: f32,
    max_new_tokens: usize,
    tx: UnboundedSender<Result<InferStreamResponse, InferError>>,
    time: Instant,
}

pub struct LlamacppBackend {
    tx: UnboundedSender<LlamacppRequest>,
    status: watch::Receiver<bool>,
}

impl LlamacppRequest {
    fn new(
        from: &ValidGenerateRequest,
        tx: UnboundedSender<Result<InferStreamResponse, InferError>>,
    ) -> Option<Self> {
        from.input_ids.as_ref().map(|input_ids| LlamacppRequest {
            input_ids: input_ids.iter().map(|&x| x as i32).collect(),
            top_k: from.parameters.top_k as _,
            top_p: from.parameters.top_p as _,
            typical_p: from.parameters.typical_p as _,
            min_keep: 0, // disabled
            temp: from.parameters.temperature as _,
            seed: from.parameters.seed as _,
            penalty_last_n: 64, // 0 = disabled, -1 = context size
            penalty_repeat: from.parameters.repetition_penalty as _,
            penalty_freq: from.parameters.frequency_penalty as _,
            penalty_present: 0.0, // disabled
            max_new_tokens: from.stopping_parameters.max_new_tokens as _,
            tx,
            time: Instant::now(),
        })
    }
}

struct Llamacpp {
    model: *mut llamacpp::llama_model,
    ctx: *mut llamacpp::llama_context,
    vocab: *const llamacpp::llama_vocab,
    logprobs: Vec<llamacpp::llama_token_data>,
    batch: llamacpp::llama_batch,
}

extern "C" fn llamacpp_log_callback(
    level: llamacpp::ggml_log_level,
    msg: *const std::os::raw::c_char,
    _user_data: *mut std::os::raw::c_void,
) {
    let cmsg = unsafe { std::ffi::CStr::from_ptr(msg) };
    let rmsg = cmsg.to_string_lossy().trim_end_matches('\n').to_string();

    match level {
        llamacpp::GGML_LOG_LEVEL_DEBUG => debug!(target: "llamacpp", "{}", rmsg),
        llamacpp::GGML_LOG_LEVEL_INFO => info!(target: "llamacpp", "{}", rmsg),
        llamacpp::GGML_LOG_LEVEL_WARN => warn!(target: "llamacpp", "{}", rmsg),
        llamacpp::GGML_LOG_LEVEL_ERROR => error!(target: "llamacpp", "{}", rmsg),
        _ => trace!(target: "llamacpp", "{}", rmsg),
    }
}

impl Llamacpp {
    fn new(conf: LlamacppConfig) -> Result<Self, BackendError> {
        let gguf = CString::new(conf.model_gguf)?;

        let model = unsafe {
            let mut params = llamacpp::model_default_params();
            params.n_gpu_layers = conf.n_gpu_layers as _;
            params.split_mode = match conf.split_mode {
                LlamacppSplitMode::GPU(_) => llamacpp::LLAMA_SPLIT_MODE_NONE,
                LlamacppSplitMode::Layer => llamacpp::LLAMA_SPLIT_MODE_LAYER,
                LlamacppSplitMode::Row => llamacpp::LLAMA_SPLIT_MODE_ROW,
            };
            params.main_gpu = match conf.split_mode {
                LlamacppSplitMode::GPU(n) => n as _,
                _ => 0,
            };
            params.use_mmap = conf.use_mmap;
            params.use_mlock = conf.use_mlock;
            llamacpp::model_load_from_file(gguf.as_ptr(), params)
        };
        if model.is_null() {
            return Err(BackendError::Llamacpp("Failed to load model".to_string()));
        }
        let ctx = unsafe {
            let mut params = llamacpp::context_default_params();
            params.n_ctx = conf.max_batch_total_tokens as _;
            params.n_batch = conf.max_batch_total_tokens as _;
            params.n_ubatch = conf.max_physical_batch_total_tokens as _;
            params.n_seq_max = conf.max_batch_size as _;
            params.n_threads = conf.n_threads as _;
            params.n_threads_batch = conf.n_threads_batch as _;
            params.defrag_thold = conf.defrag_threshold;
            params.offload_kqv = conf.offload_kqv;
            params.flash_attn = conf.flash_attention;
            params.type_k = conf.type_k.to_ggml_type();
            params.type_v = conf.type_v.to_ggml_type();
            params.no_perf = true;
            llamacpp::init_from_model(model, params)
        };
        if ctx.is_null() {
            return Err(BackendError::Llamacpp("Failed to init context".to_string()));
        }
        let vocab = unsafe { llamacpp::model_get_vocab(model) };
        if vocab.is_null() {
            return Err(BackendError::Llamacpp("Failed to get vocab".to_string()));
        }
        let n_tokens = unsafe { llamacpp::vocab_n_tokens(vocab) };
        let mut logprobs = Vec::with_capacity(n_tokens as usize);

        for token in 0..n_tokens {
            logprobs.push(llamacpp::llama_token_data {
                id: token,
                logit: 0.0,
                p: 0.0,
            });
        }
        let batch = unsafe { llamacpp::batch_init(conf.max_batch_total_tokens as _, 0, 1) };
        Ok(Llamacpp {
            model,
            ctx,
            vocab,
            logprobs,
            batch,
        })
    }

    fn decode(&mut self) -> i32 {
        unsafe { llamacpp::decode(self.ctx, self.batch) }
    }

    fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) {
        unsafe {
            llamacpp::kv_cache_seq_rm(self.ctx, seq_id, -1, -1);
        }
    }

    fn batch_push(
        &mut self,
        token: llamacpp::llama_token,
        pos: llamacpp::llama_pos,
        seq_id: llamacpp::llama_seq_id,
        logits: bool,
    ) -> usize {
        let n = self.batch.n_tokens as usize;
        unsafe {
            *self.batch.token.add(n) = token;
            *self.batch.pos.add(n) = pos;
            *self.batch.n_seq_id.add(n) = 1;
            *(*self.batch.seq_id.add(n)).add(0) = seq_id;
            *self.batch.logits.add(n) = logits as i8;
        }
        self.batch.n_tokens += 1;
        n
    }
}

impl Drop for Llamacpp {
    fn drop(&mut self) {
        if !self.ctx.is_null() {
            unsafe { llamacpp::free(self.ctx) };
        }
        if !self.model.is_null() {
            unsafe { llamacpp::model_free(self.model) };
        }
        unsafe { llamacpp::batch_free(self.batch) };
    }
}

struct LlamacppSampler {
    chain: *mut llamacpp::llama_sampler,
}

impl LlamacppSampler {
    fn new(req: &LlamacppRequest) -> Option<Self> {
        let chain = unsafe {
            let params = llamacpp::sampler_chain_default_params();
            llamacpp::sampler_chain_init(params)
        };
        if chain.is_null() {
            error!("Failed to init sampler");
            return None;
        }
        let (top_k, top_p, typical_p, temp, penalties, dist) = unsafe {
            (
                llamacpp::sampler_init_top_k(req.top_k),
                llamacpp::sampler_init_top_p(req.top_p, req.min_keep),
                llamacpp::sampler_init_typical(req.typical_p, req.min_keep),
                llamacpp::sampler_init_temp(req.temp),
                llamacpp::sampler_init_penalties(
                    req.penalty_last_n,
                    req.penalty_repeat,
                    req.penalty_freq,
                    req.penalty_present,
                ),
                llamacpp::sampler_init_dist(req.seed),
            )
        };
        let all = &[
            ("top_k", top_k),
            ("top_p", top_p),
            ("typical_p", typical_p),
            ("temp", temp),
            ("penalties", penalties),
            ("dist", dist),
        ];
        let mut failed = false;

        for (k, v) in all {
            if v.is_null() {
                error!("Failed to init {k} sampler");
                failed = true;
            } else {
                unsafe { llamacpp::sampler_chain_add(chain, *v) };
            }
        }
        if failed {
            unsafe { llamacpp::sampler_free(chain) };
            None
        } else {
            Some(LlamacppSampler { chain })
        }
    }

    fn sample(&self, llamacpp: &mut Llamacpp, idx: usize) -> (llamacpp::llama_token, f32) {
        let logits = unsafe { llamacpp::get_logits_ith(llamacpp.ctx, idx as _) };
        for (token, logprob) in llamacpp.logprobs.iter_mut().enumerate() {
            *logprob = llamacpp::llama_token_data {
                id: token as _,
                logit: unsafe { *logits.add(token) },
                p: 0.0,
            };
        }
        let mut view = llamacpp::llama_token_data_array {
            data: llamacpp.logprobs.as_mut_ptr(),
            size: llamacpp.logprobs.len(),
            selected: -1,
            sorted: false,
        };
        unsafe {
            llamacpp::sampler_apply(self.chain, &mut view);
            let logprob = *view.data.offset(view.selected as _);
            llamacpp::sampler_accept(self.chain, logprob.id);
            (logprob.id, logprob.p.ln())
        }
    }
}

impl Drop for LlamacppSampler {
    fn drop(&mut self) {
        if !self.chain.is_null() {
            unsafe { llamacpp::sampler_free(self.chain) };
        }
    }
}

struct LlamacppSeq {
    id: usize,
    batch_pos: usize,
    token: llamacpp::llama_token,
    pos: llamacpp::llama_pos,
    sampler: LlamacppSampler,
    text: String,
    n_new_tokens: usize,
    running: bool,
}

static INIT: Once = Once::new();

impl LlamacppBackend {
    pub fn new(
        conf: LlamacppConfig,
        tokenizer: Tokenizer,
    ) -> (
        Self,
        oneshot::Receiver<Result<(), BackendError>>,
        watch::Sender<bool>,
    ) {
        // Setup llama & export logs, once and for all
        INIT.call_once(|| unsafe {
            llamacpp::log_set(Some(llamacpp_log_callback), std::ptr::null_mut());
            llamacpp::backend_init();
            llamacpp::numa_init(match conf.numa {
                LlamacppNuma::Disabled => llamacpp::GGML_NUMA_STRATEGY_DISABLED,
                LlamacppNuma::Distribute => llamacpp::GGML_NUMA_STRATEGY_DISTRIBUTE,
                LlamacppNuma::Isolate => llamacpp::GGML_NUMA_STRATEGY_ISOLATE,
                LlamacppNuma::Numactl => llamacpp::GGML_NUMA_STRATEGY_NUMACTL,
                LlamacppNuma::Mirror => llamacpp::GGML_NUMA_STRATEGY_MIRROR,
            });
        });

        let (status_tx, status_rx) = watch::channel(false);
        let (shutdown_tx, shutdown_rx) = watch::channel(false);
        let (ok_tx, ok_rx) = oneshot::channel();
        let (tx, mut rx) = unbounded_channel::<LlamacppRequest>();
        let (sync_tx, sync_rx) = mpsc::channel();

        spawn(async move {
            let mut n_tokens = 0;
            let mut requests = Vec::with_capacity(conf.max_batch_size);

            let flush = |requests: &mut Vec<_>, n_tokens: &mut usize| {
                if !requests.is_empty() {
                    let _ =
                        sync_tx.send(replace(requests, Vec::with_capacity(conf.max_batch_size)));
                    *n_tokens = 0;
                }
            };
            loop {
                match timeout(conf.batch_timeout, rx.recv()).await {
                    Ok(Some(request)) => {
                        let n_tokens_to_add = request.input_ids.len();

                        if n_tokens + n_tokens_to_add > conf.max_batch_total_tokens {
                            flush(&mut requests, &mut n_tokens);
                        }
                        n_tokens += n_tokens_to_add;
                        requests.push(request);

                        if requests.len() == conf.max_batch_size {
                            flush(&mut requests, &mut n_tokens);
                        }
                    }
                    Ok(None) => break,                             // closed
                    Err(_) => flush(&mut requests, &mut n_tokens), // timeout
                }
            }
        });

        spawn_blocking(move || {
            let mut llamacpp = match Llamacpp::new(conf) {
                Ok(v) => {
                    let _ = ok_tx.send(Ok(()));
                    v
                }
                Err(e) => {
                    let _ = ok_tx.send(Err(e));
                    return;
                }
            };
            let vocab = tokenizer.get_added_vocabulary();

            // health() returns true
            let _ = status_tx.send(true);

            while let Ok(requests) = sync_rx.recv() {
                if *shutdown_rx.borrow() {
                    break;
                }
                let start_time = Instant::now();
                let mut seqs: Vec<LlamacppSeq> = Vec::with_capacity(requests.len());
                llamacpp.batch.n_tokens = 0;

                for (seq_id, request) in requests.iter().enumerate() {
                    debug!("Request: {:?}", request);
                    // TODO remove this
                    let sampler = match LlamacppSampler::new(request) {
                        Some(sampler) => sampler,
                        _ => {
                            let _ = request.tx.send(Err(InferError::IncompleteGeneration));
                            continue;
                        }
                    };
                    let last_pos = request.input_ids.len() - 1;

                    for (pos, &token_id) in request.input_ids.iter().enumerate() {
                        llamacpp.batch_push(
                            token_id as llamacpp::llama_token,
                            pos as llamacpp::llama_pos,
                            seq_id as llamacpp::llama_seq_id,
                            pos == last_pos, // check samplers
                        );
                    }
                    seqs.push(LlamacppSeq {
                        id: seq_id,
                        batch_pos: llamacpp.batch.n_tokens as usize - 1,
                        token: llamacpp::LLAMA_TOKEN_NULL,
                        pos: last_pos as llamacpp::llama_pos + 1,
                        sampler,
                        text: String::with_capacity(1024),
                        n_new_tokens: 0,
                        running: true,
                    });
                }
                while llamacpp.batch.n_tokens > 0 {
                    if llamacpp.decode() != 0 {
                        warn!("llama_decode failed, clearing kv cache");
                        llamacpp.clear_kv_cache(-1);
                        for seq in seqs.iter_mut() {
                            let _ = requests[seq.id]
                                .tx
                                .send(Err(InferError::IncompleteGeneration));
                            seq.running = false;
                        }
                        break;
                    }
                    for seq in seqs.iter_mut() {
                        if !seq.running {
                            continue;
                        }
                        let (next, logprob) = seq.sampler.sample(&mut llamacpp, seq.batch_pos);
                        seq.n_new_tokens += 1;
                        seq.token = next;

                        let piece = match tokenizer.decode(&[next as u32], false) {
                            Ok(piece) => piece,
                            Err(e) => {
                                error!("Failed to decode token: {e}");
                                let _ = requests[seq.id]
                                    .tx
                                    .send(Err(InferError::IncompleteGeneration));
                                seq.running = false;
                                continue;
                            }
                        };
                        let special = vocab.is_special_token(&piece);

                        if !special {
                            seq.text.push_str(&piece);
                        }
                        let token = Token {
                            id: next as _,
                            text: piece,
                            logprob,
                            special,
                        };
                        let finish: Option<FinishReason> = {
                            if unsafe { llamacpp::vocab_is_eog(llamacpp.vocab, next) } {
                                Some(FinishReason::EndOfSequenceToken)
                            } else if seq.n_new_tokens == requests[seq.id].max_new_tokens {
                                Some(FinishReason::Length)
                            } else {
                                None
                            }
                        };
                        if let Some(reason) = finish {
                            let _ = requests[seq.id].tx.send(Ok(InferStreamResponse::End {
                                token,
                                top_tokens: vec![],
                                generated_text: GeneratedText {
                                    text: seq.text.clone(),
                                    generated_tokens: seq.n_new_tokens as _,
                                    finish_reason: reason,
                                    seed: Some(requests[seq.id].seed as _),
                                },
                                start: start_time,
                                queued: requests[seq.id].time,
                            }));
                            seq.running = false;
                            continue;
                        }
                        let _ = requests[seq.id]
                            .tx
                            .send(Ok(InferStreamResponse::Intermediate {
                                token,
                                top_tokens: vec![],
                            }));
                    }
                    // generate a new batch
                    llamacpp.batch.n_tokens = 0;

                    for seq in seqs.iter_mut() {
                        if seq.running {
                            seq.batch_pos =
                                llamacpp.batch_push(seq.token, seq.pos, seq.id as _, true);
                            seq.pos += 1;
                        } else {
                            llamacpp.clear_kv_cache(seq.id as _);
                        }
                    }
                }
            }
        });
        (
            Self {
                tx,
                status: status_rx,
            },
            ok_rx,
            shutdown_tx,
        )
    }
}

#[async_trait]
impl Backend for LlamacppBackend {
    #[instrument(skip_all)]
    fn schedule(
        &self,
        request: ValidGenerateRequest,
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
        debug!(?request);
        let (tx, rx) = unbounded_channel::<Result<InferStreamResponse, InferError>>();
        match LlamacppRequest::new(&request, tx) {
            Some(v) => match self.tx.send(v) {
                Err(e) => Err(InferError::GenerationError(e.to_string())),
                _ => Ok(UnboundedReceiverStream::new(rx)),
            },
            _ => Err(InferError::GenerationError("Bad request".to_string())),
        }
    }

    async fn health(&self, _: bool) -> bool {
        *self.status.borrow()
    }

    fn name(&self) -> &'static str {
        "llamacpp"
    }
}

#[derive(Debug, Error)]
pub enum BackendError {
    #[error("CString error: {0}")]
    CStringError(#[from] std::ffi::NulError),
    #[error("Llamacpp error: {0}")]
    Llamacpp(String),
}


================================================
FILE: backends/llamacpp/src/llamacpp.rs
================================================
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(dead_code)]
include!(concat!(env!("OUT_DIR"), "/llamacpp.rs"));


================================================
FILE: backends/llamacpp/src/main.rs
================================================
mod backend;
mod llamacpp;
mod quantize;

use quantize::QuantizeType;

use backend::{
    BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma,
    LlamacppSplitMode,
};
use clap::Parser;
use hf_hub::api::tokio::ApiBuilder;
use hf_hub::{Repo, RepoType};
use std::path::Path;
use text_generation_router::{logging, server, usage_stats};
use thiserror::Error;
use tokenizers::Tokenizer;
use tokio::process::Command;
use tokio::sync::oneshot::error::RecvError;
use tracing::{error, warn};

/// Backend Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    /// Name of the model to load.
    #[clap(long, env)]
    model_id: String,

    /// Revision of the model.
    #[clap(default_value = "main", long, env)]
    revision: String,

    /// Path to the GGUF model file for inference.
    #[clap(long, env)]
    model_gguf: Option<String>,

    /// Number of threads to use for generation.
    #[clap(long, env)]
    n_threads: Option<usize>,

    /// Number of threads to use for batch processing.
    #[clap(long, env)]
    n_threads_batch: Option<usize>,

    /// Number of layers to store in VRAM.
    #[clap(default_value = "0", long, env)]
    n_gpu_layers: usize,

    /// Split the model across multiple GPUs.
    #[clap(default_value = "layer", long, env)]
    split_mode: LlamacppSplitMode,

    /// Defragment the KV cache if holes/size > threshold.
    #[clap(default_value = "-1.0", long, env)]
    defrag_threshold: f32,

    /// Enable NUMA optimizations.
    #[clap(default_value = "disabled", value_enum, long, env)]
    numa: LlamacppNuma,

    /// Use memory mapping for the model.
    #[clap(long, env)]
    disable_mmap: bool,

    /// Use memory locking to prevent swapping.
    #[clap(long, env)]
    use_mlock: bool,

    /// Enable offloading of KQV operations to the GPU.
    #[clap(long, env)]
    disable_offload_kqv: bool,

    /// Enable flash attention for faster inference. (EXPERIMENTAL)
    #[clap(long, env)]
    disable_flash_attention: bool,

    /// Data type used for K cache.
    #[clap(default_value = "f16", value_enum, long, env)]
    type_k: LlamacppGGMLType,

    /// Data type used for V cache.
    #[clap(default_value = "f16", value_enum, long, env)]
    type_v: LlamacppGGMLType,

    /// Number of tokenizer workers used for payload validation and truncation.
    #[clap(default_value = "2", long, env)]
    validation_workers: usize,

    /// Maximum number of concurrent requests.
    #[clap(long, env)]
    max_concurrent_requests: Option<usize>,

    /// Maximum number of input tokens per request.
    #[clap(default_value = "1024", long, env)]
    max_input_tokens: usize,

    /// Maximum number of total tokens (input + output) per request.
    #[clap(default_value = "2048", long, env)]
    max_total_tokens: usize,

    /// Maximum number of tokens in a batch.
    #[clap(long, env)]
    max_batch_total_tokens: Option<usize>,

    /// Maximum number of tokens in a physical batch.
    #[clap(long, env)]
    max_physical_batch_total_tokens: Option<usize>,

    /// Maximum number of requests per batch.
    #[clap(long, env)]
    max_batch_size: Option<usize>,

    /// IP address to listen on.
    #[clap(default_value = "0.0.0.0", long)]
    hostname: String,

    /// Port to listen on.
    #[clap(default_value = "3000", long, short, env)]
    port: u16,

    #[clap(default_value = "9000", long, short, env)]
    prometheus_port: u16,

    /// Enable JSON output format.
    #[clap(long, env)]
    json_output: bool,

    /// OTLP endpoint for telemetry data.
    #[clap(long, env)]
    otlp_endpoint: Option<String>,

    /// Service name for OTLP telemetry.
    #[clap(default_value = "text-generation-inference.router", long, env)]
    otlp_service_name: String,

    /// Allowed origins for CORS.
    #[clap(long, env)]
    cors_allow_origin: Option<Vec<String>>,

    /// Path to the tokenizer configuration file.
    #[clap(long, env)]
    tokenizer_config_path: Option<String>,

    /// Disable grammar support.
    #[clap(long, env)]
    disable_grammar_support: bool,

    /// Maximum number of inputs per request.
    #[clap(default_value = "4", long, env)]
    max_client_batch_size: usize,

    /// Level of usage statistics collection.
    #[clap(default_value = "on", long, env)]
    usage_stats: usage_stats::UsageStatsLevel,

    /// Maximum payload size in bytes.
    #[clap(default_value = "2000000", long, env)]
    payload_limit: usize,

    /// Maximum image fetch size in bytes.
    #[clap(default_value = "1073741824", long, env)]
    max_image_fetch_size: usize,
}

#[tokio::main]
async fn main() -> Result<(), RouterError> {
    let args = Args::parse();

    logging::init_logging(args.otlp_endpoint, args.otlp_service_name, args.json_output);

    let n_threads = match args.n_threads {
        Some(0) | None => num_cpus::get(),
        Some(threads) => threads,
    };
    let n_threads_batch = match args.n_threads_batch {
        Some(0) | None => n_threads,
        Some(threads) => threads,
    };
    let max_batch_size = match args.max_batch_size {
        Some(0) | None => n_threads_batch,
        Some(threads) => threads,
    };
    let max_batch_total_tokens = match args.max_batch_total_tokens {
        None => max_batch_size * args.max_total_tokens,
        Some(size) => size,
    };
    let max_physical_batch_total_tokens = match args.max_physical_batch_total_tokens {
        None => max_batch_total_tokens,
        Some(size) => size,
    };
    let max_concurrent_requests = match args.max_concurrent_requests {
        None => max_batch_size * 2,
        Some(size) => size,
    };
    if args.max_input_tokens >= args.max_total_tokens {
        return Err(RouterError::ArgumentValidation(
            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
        ));
    }
    if args.max_total_tokens > max_batch_total_tokens {
        return Err(RouterError::ArgumentValidation(
            "`max_total_tokens` must be <= `max_batch_total_tokens`".to_string(),
        ));
    }
    if max_batch_size * args.max_total_tokens > max_batch_total_tokens {
        return Err(RouterError::ArgumentValidation(
            "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(),
        ));
    }

    let api_builder = || {
        let mut builder = ApiBuilder::new().with_progress(true);

        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
            builder = builder.with_cache_dir(cache_dir.into());
        }
        if let Ok(token) = std::env::var("HF_TOKEN") {
            builder = builder.with_token(token.into());
        }
        if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
            builder = builder.with_user_agent("origin", origin.as_str());
        }
        builder
    };
    let api_repo = api_builder().build()?.repo(Repo::with_revision(
        args.model_id.clone(),
        RepoType::Model,
        args.revision.clone(),
    ));

    let tokenizer_path = api_repo.get("tokenizer.json").await?;
    let tokenizer = Tokenizer::from_file(&tokenizer_path)?;

    let model_gguf = if let Some(model_gguf) = args.model_gguf {
        model_gguf
    } else {
        let model_gguf = format!("models/{}/model.gguf", args.model_id);
        let model_gguf_path = Path::new(&model_gguf);

        if !model_gguf_path.exists() {
            let tmp_gguf = "models/tmp.gguf";

            if let Some(parent) = Path::new(model_gguf_path).parent() {
                std::fs::create_dir_all(parent)?;
            }
            let cache_path = tokenizer_path.parent().unwrap();

            for sibling in api_repo.info().await?.siblings {
                let _ = api_repo.get(&sibling.rfilename).await?;
            }
            let status = Command::new("convert_hf_to_gguf.py")
                .arg("--outfile")
                .arg(tmp_gguf)
                .arg(cache_path)
                .spawn()?
                .wait()
                .await?;

            if !status.success() {
                let exit_code = status.code().unwrap_or(-1);
                error!("Failed to generate GGUF, exit code: {}", exit_code);
                return Err(RouterError::CommandError(exit_code));
            }
            quantize::model(tmp_gguf, &model_gguf, QuantizeType::MostlyQ4_0, n_threads)
                .map_err(RouterError::QuantizeError)?;
        }
        model_gguf
    };

    let (backend, ok, shutdown) = LlamacppBackend::new(
        LlamacppConfig {
            model_gguf,
            n_threads,
            n_threads_batch,
            n_gpu_layers: args.n_gpu_layers,
            split_mode: args.split_mode,
            defrag_threshold: args.defrag_threshold,
            numa: args.numa,
            use_mmap: !args.disable_mmap,
            use_mlock: args.use_mlock,
            flash_attention: !args.disable_flash_attention,
            type_k: args.type_k,
            type_v: args.type_v,
            offload_kqv: !args.disable_offload_kqv,
            max_batch_total_tokens,
            max_physical_batch_total_tokens,
            max_batch_size,
            batch_timeout: tokio::time::Duration::from_millis(5),
        },
        tokenizer,
    );
    ok.await??;

    if cfg!(debug_assertions) {
        warn!("Graceful shutdown disabled!");
        let _ = tokio::task::spawn(async move {
            let _ = tokio::signal::ctrl_c().await;
            let _ = shutdown.send(true);
        });
    }

    server::run(
        backend,
        max_concurrent_requests,
        0, // max_best_of
        0, // max_stop_sequences
        0, // max_top_n_tokens
        args.max_input_tokens,
        args.max_total_tokens,
        args.validation_workers,
        None,          // api_key
        args.model_id, // tokenizer_name
        args.tokenizer_config_path,
        Some(args.revision),
        false, // trust_remote_code
        args.hostname,
        args.port,
        args.cors_allow_origin,
        false, // ngrok,
        None,  // ngrok_authtoken,
        None,  // ngrok_edge,
        args.disable_grammar_support,
        args.max_client_batch_size,
        args.usage_stats,
        args.payload_limit,
        args.max_image_fetch_size,
        args.prometheus_port,
    )
    .await?;
    Ok(())
}

#[derive(Debug, Error)]
enum RouterError {
    #[error("Argument validation error: {0}")]
    ArgumentValidation(String),
    #[error("Tokenizer error: {0}")]
    Tokenizer(#[from] tokenizers::Error),
    #[error("Backend error: {0}")]
    Backend(#[from] BackendError),
    #[error("WebServer error: {0}")]
    WebServer(#[from] server::WebServerError),
    #[error("Recv error: {0}")]
    RecvError(#[from] RecvError),
    #[error("Io error: {0}")]
    IoError(#[from] std::io::Error),
    #[error("Var error: {0}")]
    VarError(#[from] std::env::VarError),
    #[error("Quantize error: {0}")]
    QuantizeError(String),
    #[error("Command error: {0}")]
    CommandError(i32),
    #[error("HF hub error: {0}")]
    HubError(#[from] hf_hub::api::tokio::ApiError),
}


================================================
FILE: backends/llamacpp/src/quantize.rs
================================================
use crate::llamacpp;

use std::ffi::CString;

#[repr(u32)]
#[derive(Debug, Clone, Copy)]
pub enum QuantizeType {
    MostlyQ4_0 = 2,
}

pub fn model(
    input_path: &str,
    output_path: &str,
    ftype: QuantizeType,
    n_threads: usize,
) -> Result<(), String> {
    let c_input_path =
        CString::new(input_path).map_err(|e| format!("Failed to convert input path: {}", e))?;

    let c_output_path =
        CString::new(output_path).map_err(|e| format!("Failed to convert output path: {}", e))?;

    let result = unsafe {
        let mut params = llamacpp::model_quantize_default_params();
        params.nthread = n_threads as _;
        params.ftype = ftype as _;
        params.quantize_output_tensor = true;
        llamacpp::model_quantize(c_input_path.as_ptr(), c_output_path.as_ptr(), &params)
    };
    if result == 0 {
        Ok(())
    } else {
        Err(format!("Quantization failed, error code: {}", result))
    }
}


================================================
FILE: backends/neuron/Cargo.toml
================================================
[workspace]
members = [
  "backends/v2",
  "backends/grpc-metadata",
  "launcher",
  "router"
]
default-members = [
  "backends/v2",
  "backends/grpc-metadata",
  "launcher",
  "router"
]
resolver = "2"

[workspace.package]
version = "3.0.0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"

[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.4.2", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
pyo3 = { version = "0.22.2", features = ["auto-initialize"] }

[profile.release]
incremental = true

[profile.release-binary]
inherits = "release"
debug = 1
incremental = true
panic = "abort"

[profile.release-opt]
inherits = "release"
debug = 0
incremental = false
lto = "fat"
opt-level = 3
codegen-units = 1


================================================
FILE: backends/neuron/Makefile
================================================
#  Copyright 2025 The HuggingFace Team. All rights reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
root_dir := "${mkfile_dir}/../.."

.PHONY:	image install_server test_server test_integration

VERSION := $(shell gawk 'match($$0, /^version = "(.*)"/, a) {print a[1]}' ${root_dir}/Cargo.toml)

image:
	docker build --rm -f ${root_dir}/Dockerfile.neuron \
				 --ulimit nofile=100000:100000 \
				 --build-arg VERSION=$(VERSION) \
				 -t text-generation-inference:$(VERSION)-neuron ${root_dir}
	docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron

install_server:
	make -C ${mkfile_dir}/server install VERSION:=${VERSION}

test_server: install_server
	python -m pip install -r ${mkfile_dir}/tests/requirements.txt
	python -m pytest -sv ${mkfile_dir}/tests/server


================================================
FILE: backends/neuron/README.md
================================================
# Text-generation-inference - Neuron backend for AWS Trainium and inferentia2

## Description

This is the TGI backend for AWS Neuron Trainium and Inferentia family of chips.

This backend is composed of:
- the AWS Neuron SDK,
- the legacy v2 TGI launcher and router,
- a neuron specific inference server for text-generation.

## Usage

Please refer to the official [documentation](https://huggingface.co/docs/text-generation-inference/backends/neuron).

## Build your own image

The simplest way to build TGI with the neuron backend is to use the provided `Makefile`:

```shell
$ make -C backends/neuron image
```

Alternatively, you can build the image directly from the top directory using a command similar to the one defined
in the `Makefile` under the `image` target.


================================================
FILE: backends/neuron/server/.gitignore
================================================
build


================================================
FILE: backends/neuron/server/Makefile
================================================
# Initialize base variables
SHELL := /bin/bash
pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build
VERSION ?= 0.0.1
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
pkg_dir := $(BUILDDIR)/$(pkg_name)
py_version := $(subst -,.,${VERSION})
pkg_dist := ${BUILDDIR}/dist/${pkg_name}-$(py_version).tar.gz

clean:
	rm -rf $(BUILDDIR)/*

${BUILDDIR}:
	install -d $@

# List static sources to be deployed in the package
src_dir := $(mkfile_dir)/$(pkg_name)
sources := $(wildcard $(src_dir)/*.py)
deployed_sources := $(subst $(src_dir), $(pkg_dir), $(sources))

# Static files are just copied

define COPY
	cp -f $< $@
endef

# We use a PHONY target to represent the VERSION
.PHONY: VERSION

VERSION: ${BUILDDIR}
	# The trick is to compare the value of the variable with the content of a file in the build directory
	@if [[ `cat ${BUILDDIR}/VERSION 2>&1` != '$(VERSION)' ]]; then echo -n $(VERSION) >${BUILDDIR}/VERSION; fi

# Depending on the PHONY VERSION target makes sure the pyproject.toml is regenerated if the version changes
$(BUILDDIR)/pyproject.toml: $(mkfile_dir)/pyproject.toml VERSION
	mkdir -p $(BUILDDIR)
	$(COPY)
	sed -i -e 's/version = "VERSION"/version = \"${VERSION}\"/' $@

$(pkg_dir)/%.py: $(src_dir)/%.py
	mkdir -p $(pkg_dir)
	$(COPY)

# Generated files are produced by grpcio tools

# If not provided, get local proto files
ifndef PROTODIR
PROTODIR := $(mkfile_dir)/../../../proto
endif

# Three python files are generated for each protobuf
protobufs := $(PROTODIR)/generate.proto
pkg_pb_dir := $(pkg_dir)/pb
generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py))
generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py))

$(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi $(pkg_pb_dir)/%_pb2_grpc.py: $(PROTODIR)/%.proto
	mkdir -p $(pkg_pb_dir)
	python -m grpc_tools.protoc -I$(PROTODIR) --python_out=$(pkg_pb_dir) \
		--grpc_python_out=$(pkg_pb_dir) --mypy_out=$(pkg_pb_dir) $^
	sed -i -e 's/^\(import.*pb2\)/from . \1/g' $(pkg_pb_dir)/$*_pb2_grpc.py

${pkg_dist}: $(BUILDDIR)/pyproject.toml $(deployed_sources) $(generated_sources)
	python -m build $(BUILDDIR)

package: ${pkg_dist}

install: ${pkg_dist}
	python3 -m pip uninstall -y ${pkg_name}
	python3 -m pip install ${pkg_dist}


================================================
FILE: backends/neuron/server/build-requirements.txt
================================================
build
grpcio-tools==1.53.0
mypy-protobuf


================================================
FILE: backends/neuron/server/pyproject.toml
================================================
[build-system]
requires = ["setuptools>=78.1"]
build-backend = "setuptools.build_meta"

[project]
name = "text-generation-server"
version = "VERSION"
authors = [{name="David Corvoysier", email="david@huggingface.co" }]
description = "TGI compatible inference server for AWS Neuronx platforms"
dependencies = [
    'protobuf > 3.20.1, < 4',
    'grpcio == 1.57.0',
    'grpcio-status == 1.48.2',
    'grpcio-reflection == 1.48.2',
    'grpc-interceptor == 0.15.2',
    'typer == 0.6.1',
    'safetensors',
    'loguru == 0.6.0',
    'optimum-neuron[neuronx] >= 0.0.28',
]

[tool.setuptools]
packages = ["text_generation_server", "text_generation_server.pb"]

[project.scripts]
text-generation-server = 'text_generation_server.cli:app'


================================================
FILE: backends/neuron/server/text_generation_server/cli.py
================================================
import sys
from typing import Optional

import typer
from loguru import logger


app = typer.Typer()


@app.command()
def serve(
    model_id: str,
    revision: Optional[str] = None,
    sharded: bool = False,
    trust_remote_code: bool = None,
    uds_path: str = "/tmp/text-generation-server",
    logger_level: str = "INFO",
    json_output: bool = False,
    otlp_endpoint: Optional[str] = None,
    otlp_service_name: str = "text-generation-inference.server",
    max_input_tokens: Optional[int] = None,
):
    """This is the main entry-point for the server CLI.

    Args:
        model_id (`str`):
            The *model_id* of a model on the HuggingFace hub or the path to a local model.
        revision (`Optional[str]`, defaults to `None`):
            The revision of the model on the HuggingFace hub.
        sharded (`bool`):
            Whether the model must be sharded or not. Kept for compatibility with the
            text-generation-launcher, but must be set to False.
        trust-remote-code (`bool`):
            Kept for compatibility with text-generation-launcher. Ignored.
        uds_path (`Union[Path, str]`):
            The local path on which the server will expose its google RPC services.
        logger_level (`str`):
            The server logger level. Defaults to *INFO*.
        json_output (`bool`):
            Use JSON format for log serialization.
        otlp_endpoint (`Optional[str]`, defaults to `None`):
            The Open Telemetry endpoint to use.
        otlp_service_name (`Optional[str]`, defaults to `None`):
            The name to use when pushing data to the Open Telemetry endpoint.
        max_input_tokens (`Optional[int]`, defaults to `None`):
            The maximum number of input tokens each request should contain.
    """
    if sharded:
        raise ValueError("Sharding is not supported.")
    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    if trust_remote_code is not None:
        logger.warning(
            "'trust_remote_code' argument is not supported and will be ignored."
        )

    # Import here after the logger is added to log potential import exceptions
    from .server import serve

    serve(model_id, revision, uds_path)


@app.command()
def download_weights(
    model_id: str,
    revision: Optional[str] = None,
    logger_level: str = "INFO",
    json_output: bool = False,
    auto_convert: Optional[bool] = None,
    extension: Optional[str] = None,
    trust_remote_code: Optional[bool] = None,
    merge_lora: Optional[bool] = None,
):
    """Download the model weights.

    This command will be called by text-generation-launcher before serving the model.
    """
    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    if extension is not None:
        logger.warning("'extension' argument is not supported and will be ignored.")
    if trust_remote_code is not None:
        logger.warning(
            "'trust_remote_code' argument is not supported and will be ignored."
        )
    if auto_convert is not None:
        logger.warning("'auto_convert' argument is not supported and will be ignored.")
    if merge_lora is not None:
        logger.warning("'merge_lora' argument is not supported and will be ignored.")

    # Import here after the logger is added to log potential import exceptions
    from .model import fetch_model

    fetch_model(model_id, revision)


================================================
FILE: backends/neuron/server/text_generation_server/generator.py
================================================
import copy
import logging
import time
from abc import ABC
from enum import Enum
from typing import List, Optional, Tuple

import torch
from loguru import logger
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from optimum.neuron.configuration_utils import NeuronConfig
from transformers.generation import GenerationConfig

from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.generation import TokenSelector

from .model import get_export_kwargs_from_env
from .pb.generate_pb2 import (
    Batch,
    CachedBatch,
    FinishReason,
    GeneratedText,
    Generation,
    InfoResponse,
    Request,
    Tokens,
)


# Disable optimum-neuron warnings as it seems to block the server after a while
optimum_logger = logging.getLogger("optimum.neuron")
optimum_logger.setLevel("CRITICAL")


class Generator(ABC):
    """An abstract class to represent the workhorse behind TextGenerationService.

    Ideally, it should not rely on protobuf constructs, but in a first step it does.
    Implementations would typically need a model and a tokenizer to implement the Generator methods.
    """

    @property
    def info(self) -> InfoResponse:
        """This should simply return the expected InfoResponse"""
        raise NotImplementedError

    def warmup(self, batch: Batch) -> int:
        """Verify if the hardware can support the target load.

        Args:
            batch (`Batch`):
                A batch corresponding to the maximum number of concurrent requests.

        Return:
            The maximum number of tokens the model supports.
        """
        raise NotImplementedError

    def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
        """Prefill is called whenever new requests need to be added.

        When this method returns successfully, a decode method will follow
        with both the current and newly prefilled batch(es).

        Args:
            batch (`Batch`):
                A batch containing the new requests.

        Return:
            A list of `Generation` for each request and a `CachedBatch` containing all pending requests.
        """
        raise NotImplementedError

    def decode(self, batches: List[Batch]) -> Tuple[List[Generation], CachedBatch]:
        """Decode after a prefill or another decode."""
        raise NotImplementedError

    def filter(self, batch_id: int, request_ids: List[int]) -> CachedBatch:
        """Remove requests that are not listed from the specified batch"""
        raise NotImplementedError

    def clear(self):
        """Remove all requests from the generator"""
        raise NotImplementedError

    @classmethod
    def from_pretrained(cls, model_id: str, revision: Optional[str]):
        """Factory method "a la transformers" """
        raise NotImplementedError


class Slot:
    """Represents a slot in a static batch"""

    class State(Enum):
        EMPTY = 0
        PAUSE = 1
        READY = 2

    def __init__(self, id: int, tokenizer: PreTrainedTokenizerBase):
        self._id = id
        self._tokenizer = tokenizer
        self.clear()

    def clear(self):
        """Clear the slot and mark it as available."""
        self._state = Slot.State.EMPTY
        self._batch_id = None
        self._request_id = None
        self._inputs = ""
        self._truncate = 0
        self._generation_config = None
        self._tokens = []
        self._mask = torch.tensor([])
        self._selector = None
        self._generated_tokens = 0
        self._next_text_token_start = 0
        self._next_text_token_end = 0
        self._generated_text = ""
        self._next_text = ""

    @property
    def id(self) -> int:
        return self._id

    @property
    def state(self) -> "Slot.State":
        return self._state

    @property
    def batch_id(self) -> int:
        return self._batch_id

    @property
    def request_id(self) -> int:
        return self._request_id

    @property
    def cached_text(self) -> str:
        return self._inputs + self._generated_text

    @property
    def generation_config(self) -> GenerationConfig:
        return self._generation_config

    @property
    def generated_tokens(self) -> int:
        return self._generated_tokens

    def assign(
        self, batch_id: int, request: Request, generation_config: GenerationConfig
    ):
        """Assign a request to a slot.

        Args:
            request (`Request`):
                The request to be assigned. Contains the inputs and tokens selection parameters.
            generation_config (`transformers.GenerationConfig`):
                The base generation config (might be modified by the request generation parameters).
        """
        self._state = Slot.State.READY
        self._batch_id = batch_id
        self._request_id = request.id
        self._inputs = request.inputs
        if request.truncate:
            self._truncate = request.truncate
        self._generation_config = copy.deepcopy(generation_config)
        # Update generation config with request parameters
        self._generation_config.do_sample = request.parameters.do_sample
        if self._generation_config.do_sample:
            if request.parameters.temperature != 0:
                self._generation_config.temperature = request.parameters.temperature
            if request.parameters.top_k != 0:
                self._generation_config.top_k = request.parameters.top_k
            if request.parameters.top_p != 0:
                self._generation_config.top_p = request.parameters.top_p
            if request.parameters.typical_p != 0:
                self._generation_config.typical_p = request.parameters.typical_p
        else:
            # Set the sampling parameters to emulate greedy decoding when using on-device sampling
            self._generation_config.temperature = 1.0
            self._generation_config.top_k = 1
            self._generation_config.top_p = 1.0
            self._generation_config.typical_p = 1.0
        if request.parameters.repetition_penalty != 0:
            self._generation_config.repetition_penalty = (
                request.parameters.repetition_penalty
            )
        self.seed = request.parameters.seed
        self._generation_config.max_new_tokens = (
            request.stopping_parameters.max_new_tokens
        )
        self._max_new_tokens = self._generation_config.max_new_tokens
        stop_strings = request.stopping_parameters.stop_sequences
        if stop_strings:
            self._generation_config.stop_strings = stop_strings

    def reset(
        self,
        input_ids: torch.LongTensor,
        attention_mask: torch.LongTensor,
        selector: TokenSelector,
    ):
        """Reset the slot for the next generation.

        Args:
            input_ids: (`torch.LongTensor`):
                The new input_ids to use to generate the next token.
            attention_mask: (`torch.LongTensor`):
                The new attention_mask to use to generate the next token.
            selector: (`optimum.neuron.generation.TokenSelector`):
                An object implementing the updated token selection logic.
        """
        self._tokens = input_ids.clone()
        self._next_text_token_start = 0
        self._next_text_token_end = torch.numel(self._tokens)
        self._next_text = ""
        self._mask = attention_mask.clone()
        self._selector = selector

    def pause(self):
        """Mark the current slot as paused for generation.

        Note that the KV cache for this slot will still be filled.
        """
        self._state = Slot.State.PAUSE

    def resume(self):
        """Mark the slot as ready for generation."""
        self._state = Slot.State.READY

    def _decode_next_tokens(
        self,
    ) -> str:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
        # We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode
        # which decide to add a space or not depending on the surrounding ids.
        new_text = self._tokenizer.decode(
            self._tokens[self._next_text_token_start :], skip_special_tokens=False
        )
        if new_text.endswith("�"):
            # utf-8 char at the end means it's a potential unfinished byte sequence
            # from byte fallback tokenization.
            return ""

        # Compare the generated text with the one using only the tokens producing the last one
        last_text = self._tokenizer.decode(
            self._tokens[self._next_text_token_start : self._next_text_token_end],
            skip_special_tokens=False,
        )
        if len(new_text) == len(last_text):
            # Nothing new was actually generated
            return ""
        # Return the decoded text and store its token offsets
        self._next_text_token_start = self._next_text_token_end
        self._next_text_token_end = torch.numel(self._tokens)
        return new_text[len(last_text) :]

    def append(self, next_token: int) -> str:
        """Append a new generated token to this slot

        The new token is added to the list of generated tokens, which impacts
        directly the generated_text and stopped property.

        The new token is however not added immediately to the slot inputs: it will
        be added later on when it has effectively been used to produce the next token.

        Args:
            next_token (`int`):
                The newly generated token.

        Return:
            The corresponding decoded text (if any).
        """
        self._tokens = torch.cat([self._tokens, torch.LongTensor([next_token])])
        self._mask = torch.cat([self._mask, torch.LongTensor([1])])
        self._generated_tokens += 1
        next_text = self._decode_next_tokens()
        # Now that a new token has been generated, we can append the previous one to the generated text
        self._generated_text += self._next_text
        self._next_text = next_text
        return next_text

    def select(
        self, input_ids: torch.LongTensor, logits: torch.Tensor
    ) -> torch.LongTensor:
        """Select the next token from the candidate logits.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation (not used in all generation modes).
            logits (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The logits corresponding to the generated tokens.

        Return:
            `torch.LongTensor`: A scalar torch.LongTensor` containing the selected token.
        """
        return self._selector.select(input_ids, logits)[0]

    @property
    def stopped(self) -> bool:
        # Transformers stopping criteria expects a batch of input ids
        input_ids = torch.unsqueeze(self._tokens, dim=0)
        return self._selector.stopping_criteria(input_ids, None)

    @property
    def generated_text(self) -> str:
        return self._generated_text + self._next_text

    @property
    def next_token(self) -> int:
        return None if len(self._tokens) == 0 else self._tokens[-1]

    @property
    def attention_mask(self) -> torch.LongTensor:
        return self._mask

    @property
    def max_token(self) -> int:
        return self._generation_config.max_length

    @property
    def max_new_tokens(self) -> int:
        # The current value of max_new_tokens: might be different of the target max_new_tokens
        # if the slot has been paused and resumed.
        return self._generation_config.max_new_tokens

    @property
    def truncate(self) -> int:
        return self._truncate


class NeuronGenerator(Generator):
    """A Generator for Neuron models."""

    def __init__(
        self,
        model: NeuronModelForCausalLM,
        tokenizer: PreTrainedTokenizerBase,
    ):
        self.model = model
        if not isinstance(self.model, NeuronModelForCausalLM):
            raise ValueError("The model must be a NeuronModelForCausalLM.")
        if (
            model.neuron_config.batch_size > 1
            and not model.neuron_config.continuous_batching
        ):
            raise ValueError(
                "The neuron model must be compiled with continuous_batching=True."
            )
        # Specify padding and truncation options for decoder-only architecture
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.padding_side = "left"
        tokenizer.truncation_side = "left"
        self.tokenizer = tokenizer
        self.special_tokens = self.tokenizer.all_special_ids
        self.slots = [
            Slot(i, tokenizer) for i in range(self.model.neuron_config.batch_size)
        ]
        self.batch_id = 0

    @property
    def on_device_sampling(self) -> bool:
        return getattr(self.model.neuron_config, "on_device_sampling", False)

    @property
    def info(self) -> InfoResponse:
        """Returns the expected InfoResponse."""
        dtype = getattr(self.model.config, "torch_dtype", "float32")
        return InfoResponse(
            requires_padding=True,
            dtype=str(dtype),
            device_type="xla",
        )

    def warmup(self, batch: Batch) -> int:
        """Verify if the hardware can support the target load.

        Args:
            batch (`Batch`):
                A batch corresponding to the maximum number of concurrent requests.

        Return:
            The maximum number of tokens the model supports.
        """
        # Just check that the warmup request parameters match the model capacity
        batch_size = self.model.neuron_config.batch_size
        if len(batch.requests) > batch_size:
            raise ValueError(
                f"Inconsistent batch_size configuration: Please make sure the batch_size in the compiled model (currently {batch_size}) matches the batch_size passed to TGI.  The compiled model.neuron_config.batch_size is usually in the neuron section of the model config.json file. You may also have passed it into optimum-cli during the compilation process.  The batch size for TGI is usually set in the environment as MAX_BATCH_SIZE."
            )
        self.prefill(batch)
        self.clear()
        return (
            self.model.neuron_config.batch_size
            * self.model.neuron_config.sequence_length
        )

    def max_prefill_length(self) -> int:
        if hasattr(self.model.neuron_config, "max_context_length"):
            return self.model.neuron_config.max_context_length
        return self.model.neuron_config.sequence_length

    def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
        """Prefill new requests.

        Args:
            batch (`Batch`):
                A batch containing the new requests.

        Return:
            A list of `Generation` for each request and a `CachedBatch` containing all pending requests.
        """
        slots = {state: [] for state in Slot.State}
        for slot in self.slots:
            slots[slot.state].append(slot)
        active_slots = slots[Slot.State.READY]
        empty_slots = slots[Slot.State.EMPTY]
        if len(empty_slots) < len(batch.requests):
            raise ValueError(
                f"Cannot prefill {len(batch.requests)} new request(s) with only {len(empty_slots)} empty slots."
                f" Please align max_batch_size with the static batch size: {self.model.neuron_config.batch_size}."
            )
        # Assign each request to an empty slot
        logger.debug(
            f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)"
        )
        new_slots = []
        for request in batch.requests:
            slot = empty_slots.pop()
            slot.assign(self.batch_id, request, self.model.generation_config)
            new_slots.append(slot)
            logger.debug(
                f"Request {slot.request_id} assigned to slot {slot.id} with and max_new_tokens {slot.max_new_tokens}"
            )
        prefill_slots = new_slots
        seq_ids = torch.tensor([slot.id for slot in prefill_slots])
        # Reconstruct the full inputs (without padding) as seen by the model.
        # This comprises:
        # - the inputs for new requests,
        # - only when rebuilding the cache, the inputs and the generated text that has already
        # been cached (i.e. excluding the last generated token) for unfinished requests.
        inputs = []
        max_length = 0
        for slot in prefill_slots:
            inputs.append(slot.cached_text)
            # Apply truncation, making sure we fit into static dimensions
            if slot.truncate == 0:
                max_length = self.max_prefill_length()
            elif (
                slot.truncate > max_length and slot.truncate < self.max_prefill_length()
            ):
                max_length = slot.truncate
        # Tokenize with padding and truncation
        padded_inputs = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        input_ids = padded_inputs.input_ids
        attention_mask = padded_inputs.attention_mask
        sampling_params = (
            torch.zeros(input_ids.shape[0], 3) if self.on_device_sampling else None
        )
        # Pause previously active slots during generation
        for slot in active_slots:
            slot.pause()
        # Each slot must be reset with the padded inputs and masks
        for i, slot in enumerate(prefill_slots):
            if slot.state != slot.state.EMPTY:
                if slot.truncate > 0 and slot.truncate < input_ids.shape[-1]:
                    # Apply per-request truncation
                    input_ids[i, : -slot.truncate] = self.tokenizer.pad_token_id
                    attention_mask[i, : -slot.truncate] = 0
                slot_input_ids = input_ids[i : i + 1, :]
                # Padded input ids are also required to set logits processors and stopping criterias
                selector = TokenSelector.create(
                    slot_input_ids,
                    slot.generation_config,
                    self.model,
                    self.model.neuron_config.sequence_length,
                    tokenizer=self.tokenizer,
                    seed=slot.seed,
                )
                slot_input_ids = slot_input_ids.squeeze(dim=0).type(torch.int64)
                slot_attention_mask = attention_mask[i]
                slot.reset(slot_input_ids, slot_attention_mask, selector)
                if sampling_params is not None:
                    sampling_params[i, 0] = slot.generation_config.top_k
                    sampling_params[i, 1] = slot.generation_config.top_p
                    sampling_params[i, 2] = slot.generation_config.temperature
        # Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored,
        # as they have already been generated and sent back in the last decode.
        model_inputs = self.model.prepare_inputs_for_prefill(
            input_ids,
            attention_mask=attention_mask,
            seq_ids=seq_ids,
            sampling_params=sampling_params,
        )
        tokens_or_logits = self.model(**model_inputs)[0]
        generation, next_batch = self._generate_token(
            prefill_slots, self.batch_id, tokens_or_logits, input_ids
        )
        self.batch_id += 1
        # Reactivate previously active slots for the next decode
        for i, slot in enumerate(active_slots):
            slot.resume()
        logger.debug("Model ready for decoding")
        if next_batch is not None:
            logger.debug(
                f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}"
            )
        return generation, next_batch

    def decode(
        self, batches: List[CachedBatch]
    ) -> Tuple[List[Generation], CachedBatch]:
        """Decode the specified prefilled requests.

        Args:
            batches (`List[CachedBatch]`):
                A list of previous batches containing the prefilled requests.

        Return:
            A list of `Generation` for each request and a `CachedBatch` containing all pending requests.
        """
        # batches contains a list composed of:
        # - the batch id returned by the last decode,
        # - the batch id(s) returned by the last prefill(s)
        # Batches are always concatenated during prefill, so we can
        # just carry on with decoding. We adopt the id of the first
        # batch in the list as our next batch id.
        next_batch_id = batches[0].id
        request_ids = []
        for batch in batches:
            request_ids += batch.request_ids
        cleared_request_ids = []
        for slot in self.slots:
            if slot.state == slot.State.READY and slot.request_id not in request_ids:
                cleared_request_ids.append(slot.request_id)
                slot.clear()
        if len(cleared_request_ids) > 0:
            logger.info(
                f"Clearing slot for requests {cleared_request_ids} as they are not requested."
            )
        active_slots = [slot for slot in self.slots if slot.state == slot.State.READY]
        if len(active_slots) < len(request_ids):
            raise ValueError(
                "Unable to decode tokens for non-prefilled batches (probably due to a previous failure)"
            )
        decode_slots = active_slots
        seq_ids = torch.tensor([slot.id for slot in decode_slots])
        # Reconstruct input_ids and attention_mask from decode slots
        n_slots = len(decode_slots)
        input_ids = torch.full(
            [n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64
        )
        max_length = 0
        for slot in decode_slots:
            max_length = max(max_length, slot.attention_mask.size(-1))
        attention_mask = torch.zeros([n_slots, max_length], dtype=torch.int64)
        sampling_params = torch.zeros(n_slots, 3) if self.on_device_sampling else None
        for i, slot in enumerate(decode_slots):
            if slot.state != Slot.State.EMPTY:
                # input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached)
                input_ids[i, 0] = slot.next_token
                attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask
                if sampling_params is not None:
                    sampling_params[i, 0] = slot.generation_config.top_k
                    sampling_params[i, 1] = slot.generation_config.top_p
                    sampling_params[i, 2] = slot.generation_config.temperature
        model_inputs = self.model.prepare_inputs_for_decode(
            input_ids,
            attention_mask=attention_mask,
            seq_ids=seq_ids,
            sampling_params=sampling_params,
        )
        tokens_or_logits = self.model(**model_inputs)[0]
        return self._generate_token(
            decode_slots, next_batch_id, tokens_or_logits, input_ids
        )

    def _generate_token(
        self,
        slots: List[Slot],
        next_batch_id: int,
        tokens_or_logits: torch.Tensor,
        input_ids: torch.LongTensor,
    ) -> Tuple[List[Generation], CachedBatch]:
        generations = []
        active_slots = False
        for i, slot in enumerate(slots):
            if slot.state != Slot.State.READY:
                continue
            request_id = slot.request_id
            slot_input_ids = input_ids[i : i + 1, :]
            if self.on_device_sampling:
                next_token = tokens_or_logits[i]
            else:
                next_token_logits = tokens_or_logits[i : i + 1, -1, :]
                next_token = slot.select(slot_input_ids, next_token_logits)
            next_token_text = slot.append(next_token)
            generated_text = None
            finish_reason = None
            if next_token == self.tokenizer.eos_token_id:
                finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
            elif slot.stopped:
                if slot.generated_tokens == slot.max_new_tokens:
                    finish_reason = FinishReason.FINISH_REASON_LENGTH
                else:
                    finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
            if finish_reason is not None:
                # We must include the generated text for each finished sequence in the response
                generated_text = GeneratedText(
                    text=slot.generated_text,
                    generated_tokens=slot.generated_tokens,
                    finish_reason=finish_reason,
                )
                logger.debug(
                    f"Decode complete for request {request_id} with {slot.generated_tokens} tokens"
                )
                # mark the slot as available
                slot.clear()
            else:
                active_slots = True
            generations.append(
                Generation(
                    request_id=request_id,
                    prefill_tokens=None,
                    tokens=Tokens(
                        ids=[next_token],
                        logprobs=[0],
                        texts=[next_token_text],
                        is_special=[next_token in self.special_tokens],
                    ),
                    generated_text=generated_text,
                )
            )
        batch = None
        if active_slots:
            # Whatever initial batch these requests came from, we always return all pending requests in a single batch
            request_ids = [
                slot.request_id for slot in self.slots if slot.state == Slot.State.READY
            ]
            batch = self._cached_batch(next_batch_id, request_ids)
        else:
            logger.debug("No more pending requests")
        return generations, batch

    def _cached_batch(self, batch_id: int, request_ids: List):
        size = len(request_ids)
        max_tokens = size * self.model.neuron_config.sequence_length
        return CachedBatch(
            id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens
        )

    def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch:
        """Remove requests that are not listed from the specified batch

        Args:
            batch_id (`int`):
                The id of a cached batch.
            keep_ids(`List[int]`):
                The list of requests that must be kept.

        Return:
            A `CachedBatch` containing the pending requests.
        """
        keep_slot_ids = [
            slot.id for slot in self.slots if slot.request_id in keep_request_ids
        ]
        self._clear(keep_slot_ids)
        return self._cached_batch(batch_id, keep_request_ids)

    def clear(self, batch_id: Optional[int] = None):
        """Remove a subset or all requests from the generator"""
        keep_ids = []
        if batch_id is not None:
            keep_ids = [slot.id for slot in self.slots if slot.batch_id != batch_id]
        return self._clear(keep_ids)

    def _clear(self, keep_slot_ids: List):
        for slot in self.slots:
            if slot.state != Slot.State.EMPTY and slot.id not in keep_slot_ids:
                logger.debug(f"Removing slot {slot.id} with request {slot.request_id}")
                slot.clear()

    @classmethod
    def from_pretrained(cls, model_id: str, revision: str = None):
        """Instantiate a NeuronGenerator.

        Args:
            model_id (`str`):
                A hub model id or the path to a local model. This path must also contain a Tokenizer.
            revision (`Optional[str]`, defaults to `None`):
                The revision of the model on the HuggingFace hub.

        Returns:
            A NeuronGenerator.
        """
        try:
            neuron_config = NeuronConfig.from_pretrained(model_id, revision=revision)
        except Exception as e:
            logger.debug(
                "NeuronConfig.from_pretrained failed for model %s, revision %s: %s",
                model_id,
                revision,
                e,
            )
            neuron_config = None
        start = time.time()
        if neuron_config is None:
            export_kwargs = get_export_kwargs_from_env()
            logger.info(f"Exporting model to neuron with config: {export_kwargs}.")
            model = NeuronModelForCausalLM.from_pretrained(
                model_id,
                revision=revision,
                low_cpu_mem_usage=True,
                export=True,
                **export_kwargs,
            )
        else:
            logger.info(
                "Loading model on neuron devices (this can take a few minutes)."
            )
            model = NeuronModelForCausalLM.from_pretrained(
                model_id, low_cpu_mem_usage=True, revision=revision
            )
        end = time.time()
        logger.info(f"Model successfully loaded in {end - start:.2f} s.")
        tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
        return cls(model, tokenizer)


================================================
FILE: backends/neuron/server/text_generation_server/interceptor.py
================================================
from typing import Any, Callable

import grpc
from google.rpc import code_pb2, status_pb2
from grpc_interceptor.server import AsyncServerInterceptor
from grpc_status import rpc_status
from loguru import logger


class ExceptionInterceptor(AsyncServerInterceptor):
    async def intercept(
        self,
        method: Callable,
        request_or_iterator: Any,
        context: grpc.ServicerContext,
        method_name: str,
    ) -> Any:
        try:
            response = method(request_or_iterator, context)
            return await response
        except Exception as err:
            method_name = method_name.split("/")[-1]
            logger.exception(f"Method {method_name} encountered an error.")

            await context.abort_with_status(
                rpc_status.to_status(
                    status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))
                )
            )


================================================
FILE: backends/neuron/server/text_generation_server/model.py
================================================
import os
import shutil
import time
from typing import Optional

from huggingface_hub import snapshot_download
from huggingface_hub.constants import HF_HUB_CACHE
from loguru import logger

from optimum.neuron.cache import get_hub_cached_entries
from optimum.neuron.configuration_utils import NeuronConfig


from .tgi_env import check_env_and_neuron_config_compatibility


def get_export_kwargs_from_env():
    batch_size = os.environ.get("MAX_BATCH_SIZE", None)
    if batch_size is not None:
        batch_size = int(batch_size)
    sequence_length = os.environ.get("MAX_TOTAL_TOKENS", None)
    if sequence_length is not None:
        sequence_length = int(sequence_length)
    num_cores = os.environ.get("HF_NUM_CORES", None)
    if num_cores is not None:
        num_cores = int(num_cores)
    auto_cast_type = os.environ.get("HF_AUTO_CAST_TYPE", None)
    return {
        "batch_size": batch_size,
        "sequence_length": sequence_length,
        "num_cores": num_cores,
        "auto_cast_type": auto_cast_type,
    }


def is_cached(model_id):
    # Look for cached entries for the specified model
    in_cache = False
    entries = get_hub_cached_entries(model_id)
    # Look for compatible entries
    for entry in entries:
        if check_env_and_neuron_config_compatibility(
            entry, check_compiler_version=True
        ):
            in_cache = True
            break
    return in_cache


def log_cache_size():
    path = HF_HUB_CACHE
    if os.path.exists(path):
        usage = shutil.disk_usage(path)
        gb = 2**30
        logger.info(
            f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G"
        )
    else:
        raise ValueError(f"The cache directory ({path}) does not exist.")


def fetch_model(
    model_id: str,
    revision: Optional[str] = None,
) -> str:
    """Fetch a neuron model.

    Args:
        model_id (`str`):
            The *model_id* of a model on the HuggingFace hub or the path to a local model.
        revision (`Optional[str]`, defaults to `None`):
            The revision of the model on the HuggingFace hub.

    Returns:
        A string corresponding to the model_id or path.
    """
    if not os.path.isdir("/sys/class/neuron_device/"):
        raise SystemError("No neuron cores detected on the host.")
    if os.path.isdir(model_id) and revision is not None:
        logger.warning(
            "Revision {} ignored for local model at {}".format(revision, model_id)
        )
        revision = None
    # Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model)
    # Note that the model may already be present in the cache.
    try:
        neuron_config = NeuronConfig.from_pretrained(model_id, revision=revision)
    except Exception as e:
        logger.debug(
            "NeuronConfig.from_pretrained failed for model %s, revision %s: %s",
            model_id,
            revision,
            e,
        )
        neuron_config = None
    if neuron_config is not None:
        if os.path.isdir(model_id):
            return model_id
        # Prefetch the neuron model from the Hub
        logger.info(
            f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}"
        )
        log_cache_size()
        return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
    # Model needs to be exported: look for compatible cached entries on the hub
    if not is_cached(model_id):
        hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache"
        neuron_export_url = "https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi"
        error_msg = (
            f"No cached version found for {model_id} with {get_export_kwargs_from_env()}."
            f"You can start a discussion to request it on {hub_cache_url}"
            f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}"
        )
        raise ValueError(error_msg)
    logger.warning(
        f"{model_id} is not a neuron model: it will be exported using cached artifacts."
    )
    if os.path.isdir(model_id):
        return model_id
    # Prefetch weights, tokenizer and generation config so that they are in cache
    log_cache_size()
    start = time.time()
    snapshot_path = snapshot_download(
        model_id, revision=revision, ignore_patterns="*.bin"
    )
    end = time.time()
    logger.info(f"Model weights fetched in {end - start:.2f} s.")
    log_cache_size()
    return snapshot_path


================================================
FILE: backends/neuron/server/text_generation_server/server.py
================================================
import asyncio
from pathlib import Path
from typing import List

from grpc import aio
from grpc_reflection.v1alpha import reflection
from loguru import logger

from .generator import Generator, NeuronGenerator
from .interceptor import ExceptionInterceptor
from .pb import generate_pb2, generate_pb2_grpc


class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
    def __init__(self, generator: Generator, server_urls: List[str]):
        self.generator = generator
        self.server_urls = server_urls

    async def Info(self, request, context):
        return self.generator.info

    async def Health(self, request, context):
        return generate_pb2.HealthResponse()

    async def ServiceDiscovery(self, request, context):
        return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)

    async def ClearCache(self, request, context):
        if request.HasField("id"):
            self.generator.clear(request.id)
        else:
            self.generator.clear()
        return generate_pb2.ClearCacheResponse()

    async def FilterBatch(self, request, context):
        filtered_batch = self.generator.filter(request.batch_id, request.request_ids)
        return generate_pb2.FilterBatchResponse(batch=filtered_batch)

    async def Warmup(self, request, context):
        max_tokens = self.generator.warmup(request.batch)
        return generate_pb2.WarmupResponse(max_supported_total_tokens=max_tokens)

    async def Prefill(self, request, context):
        generations, batch = self.generator.prefill(request.batch)
        return generate_pb2.PrefillResponse(generations=generations, batch=batch)

    async def Decode(self, request, context):
        generations, batch = self.generator.decode(request.batches)
        return generate_pb2.DecodeResponse(generations=generations, batch=batch)


def serve(
    model_id: str,
    revision: str,
    uds_path: Path,
):
    async def serve_inner(model_id: str, revision: str):
        unix_socket_template = "unix://{}-{}"
        local_url = unix_socket_template.format(uds_path, 0)
        server_urls = [local_url]

        try:
            generator = NeuronGenerator.from_pretrained(model_id, revision)
        except Exception:
            logger.exception("Error when initializing model")
            raise

        server = aio.server(interceptors=[ExceptionInterceptor()])
        generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
            TextGenerationService(generator, server_urls), server
        )
        SERVICE_NAMES = (
            generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
            reflection.SERVICE_NAME,
        )
        reflection.enable_server_reflection(SERVICE_NAMES, server)
        server.add_insecure_port(local_url)

        await server.start()

        logger.info("Server started at {}".format(local_url))

        try:
            await server.wait_for_termination()
        except KeyboardInterrupt:
            logger.info("Signal received. Shutting down")
            await server.stop(0)

    asyncio.run(serve_inner(model_id, revision))


================================================
FILE: backends/neuron/server/text_generation_server/tgi_env.py
================================================
#!/usr/bin/env python

import argparse
import logging
import os
import sys
from typing import Any, Dict, List, Optional

from optimum.neuron.modeling_decoder import get_available_cores
from optimum.neuron.cache import get_hub_cached_entries
from optimum.neuron.configuration_utils import NeuronConfig
from optimum.neuron.utils.version_utils import get_neuronxcc_version
from optimum.neuron.utils import map_torch_dtype


logger = logging.getLogger(__name__)

tgi_router_env_vars = [
    "MAX_BATCH_SIZE",
    "MAX_TOTAL_TOKENS",
    "MAX_INPUT_TOKENS",
    "MAX_BATCH_PREFILL_TOKENS",
]
tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]


# By the end of this script all env var should be specified properly
tgi_env_vars = tgi_server_env_vars + tgi_router_env_vars

available_cores = get_available_cores()
neuronxcc_version = get_neuronxcc_version()


def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    if not argv:
        argv = sys.argv
    # All these are params passed to tgi and intercepted here
    parser.add_argument(
        "--max-input-tokens",
        type=int,
        default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)),
    )
    parser.add_argument(
        "--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)
    )
    parser.add_argument(
        "--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)
    )
    parser.add_argument(
        "--max-batch-prefill-tokens",
        type=int,
        default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0),
    )
    parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID"))
    parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))

    args = parser.parse_known_args(argv)[0]

    if not args.model_id:
        raise Exception(
            "No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var"
        )

    # Override env with cmdline params
    os.environ["MODEL_ID"] = args.model_id

    # Set all tgi router and tgi server values to consistent values as early as possible
    # from the order of the parser defaults, the tgi router value can override the tgi server ones
    if args.max_total_tokens > 0:
        os.environ["MAX_TOTAL_TOKENS"] = str(args.max_total_tokens)

    if args.max_input_tokens > 0:
        os.environ["MAX_INPUT_TOKENS"] = str(args.max_input_tokens)

    if args.max_batch_size > 0:
        os.environ["MAX_BATCH_SIZE"] = str(args.max_batch_size)

    if args.max_batch_prefill_tokens > 0:
        os.environ["MAX_BATCH_PREFILL_TOKENS"] = str(args.max_batch_prefill_tokens)

    if args.revision:
        os.environ["REVISION"] = str(args.revision)

    return args


def neuron_config_to_env(neuron_config):
    if isinstance(neuron_config, NeuronConfig):
        neuron_config = neuron_config.to_dict()
    with open(os.environ["ENV_FILEPATH"], "w") as f:
        f.write("export MAX_BATCH_SIZE={}\n".format(neuron_config["batch_size"]))
        f.write("export MAX_TOTAL_TOKENS={}\n".format(neuron_config["sequence_length"]))
        f.write("export HF_NUM_CORES={}\n".format(neuron_config["tp_degree"]))
        config_key = (
            "auto_cast_type" if "auto_cast_type" in neuron_config else "torch_dtype"
        )
        auto_cast_type = neuron_config[config_key]
        f.write("export HF_AUTO_CAST_TYPE={}\n".format(auto_cast_type))
        max_input_tokens = os.getenv("MAX_INPUT_TOKENS")
        if not max_input_tokens:
            max_input_tokens = int(neuron_config["sequence_length"]) // 2
            if max_input_tokens == 0:
                raise Exception("Model sequence length should be greater than 1")
        f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens))
        max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS")
        if not max_batch_prefill_tokens:
            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(
                max_input_tokens
            )
        f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens))


def sort_neuron_configs(dictionary):
    return -dictionary["tp_degree"], -dictionary["batch_size"]


def lookup_compatible_cached_model(
    model_id: str, revision: Optional[str]
) -> Optional[Dict[str, Any]]:
    # Reuse the same mechanic as the one in use to configure the tgi server part
    # The only difference here is that we stay as flexible as possible on the compatibility part
    entries = get_hub_cached_entries(model_id)

    logger.debug(
        "Found %d cached entries for model %s, revision %s",
        len(entries),
        model_id,
        revision,
    )

    all_compatible = []
    for entry in entries:
        if check_env_and_neuron_config_compatibility(
            entry, check_compiler_version=True
        ):
            all_compatible.append(entry)

    if not all_compatible:
        logger.debug(
            "No compatible cached entry found for model %s, env %s, available cores %s, neuronxcc version %s",
            model_id,
            get_env_dict(),
            available_cores,
            neuronxcc_version,
        )
        return None

    logger.info("%d compatible neuron cached models found", len(all_compatible))

    all_compatible = sorted(all_compatible, key=sort_neuron_configs)

    entry = all_compatible[0]

    return entry


def check_env_and_neuron_config_compatibility(
    neuron_config_dict: Dict[str, Any], check_compiler_version: bool
) -> bool:
    logger.debug(
        "Checking the provided neuron config %s is compatible with the local setup and provided environment",
        neuron_config_dict,
    )

    # Local setup compat checks
    if neuron_config_dict["tp_degree"] > available_cores:
        logger.debug(
            "Not enough neuron cores available to run the provided neuron config"
        )
        return False

    if (
        check_compiler_version
        and neuron_config_dict["neuronxcc_version"] != neuronxcc_version
    ):
        logger.debug(
            "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
            neuronxcc_version,
            neuron_config_dict["neuronxcc_version"],
        )
        return False

    batch_size = os.getenv("MAX_BATCH_SIZE", None)
    if batch_size is not None and neuron_config_dict["batch_size"] < int(batch_size):
        logger.debug(
            "The provided MAX_BATCH_SIZE (%s) is higher than the neuron config batch size (%s)",
            os.getenv("MAX_BATCH_SIZE"),
            neuron_config_dict["batch_size"],
        )
        return False
    max_total_tokens = os.getenv("MAX_TOTAL_TOKENS", None)
    if max_total_tokens is not None and neuron_config_dict["sequence_length"] < int(
        max_total_tokens
    ):
        logger.debug(
            "The provided MAX_TOTAL_TOKENS (%s) is higher than the neuron config sequence length (%s)",
            max_total_tokens,
            neuron_config_dict["sequence_length"],
        )
        return False
    num_cores = os.getenv("HF_NUM_CORES", None)
    if num_cores is not None and neuron_config_dict["tp_degree"] < int(num_cores):
        logger.debug(
            "The provided HF_NUM_CORES (%s) is higher than the neuron config tp degree (%s)",
            num_cores,
            neuron_config_dict["tp_degree"],
        )
        return False
    auto_cast_type = os.getenv("HF_AUTO_CAST_TYPE", None)
    if auto_cast_type is not None:
        config_key = (
            "auto_cast_type"
            if "auto_cast_type" in neuron_config_dict
            else "torch_dtype"
        )
        neuron_config_value = map_torch_dtype(str(neuron_config_dict[config_key]))
        env_value = map_torch_dtype(auto_cast_type)
        if env_value != neuron_config_value:
            logger.debug(
                "The provided auto cast type and the neuron config param differ (%s != %s)",
                env_value,
                neuron_config_value,
            )
            return False
    max_input_tokens = int(
        os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
    )
    if max_input_tokens > 0:
        if hasattr(neuron_config_dict, "max_context_length"):
            sequence_length = neuron_config_dict["max_context_length"]
        else:
            sequence_length = neuron_config_dict["sequence_length"]
        if max_input_tokens >= sequence_length:
            logger.debug(
                "Specified max input tokens is not compatible with config sequence length ( %s >= %s)",
                max_input_tokens,
                sequence_length,
            )
            return False

    return True


def get_env_dict() -> Dict[str, str]:
    d = {}
    for k in tgi_env_vars:
        d[k] = os.getenv(k)
    return d


def get_neuron_config_for_model(
    model_name_or_path: str, revision: Optional[str] = None
) -> NeuronConfig:
    try:
        neuron_config = NeuronConfig.from_pretrained(
            model_name_or_path, revision=revision
        )
    except Exception as e:
        logger.debug(
            "NeuronConfig.from_pretrained failed for model %s, revision %s: %s",
            model_name_or_path,
            revision,
            e,
        )
        neuron_config = None
    if neuron_config is not None:
        compatible = check_env_and_neuron_config_compatibility(
            neuron_config.to_dict(), check_compiler_version=False
        )
        if not compatible:
            env_dict = get_env_dict()
            msg = (
                "Invalid neuron config and env. Config {}, env {}, available cores {}, neuronxcc version {}"
            ).format(neuron_config, env_dict, available_cores, neuronxcc_version)
            logger.error(msg)
            raise Exception(msg)
    else:
        neuron_config = lookup_compatible_cached_model(model_name_or_path, revision)

    return neuron_config


================================================
FILE: backends/neuron/tests/conftest.py
================================================
pytest_plugins = ["fixtures.model"]


================================================
FILE: backends/neuron/tests/fixtures/model.py
================================================
import copy
import logging
import subprocess
import sys
from tempfile import TemporaryDirectory

import os
import pytest
from transformers import AutoTokenizer


from optimum.neuron.cache import synchronize_hub_cache


logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
    stream=sys.stdout,
)
logger = logging.getLogger(__file__)


OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"


# All model configurations below will be added to the neuron_model_config fixture
MODEL_CONFIGURATIONS = {
    "llama": {
        "model_id": "unsloth/Llama-3.2-1B-Instruct",
        "export_kwargs": {
            "batch_size": 4,
            "sequence_length": 4096,
            "num_cores": 2,
            "auto_cast_type": "bf16",
        },
    },
    "qwen2": {
        "model_id": "Qwen/Qwen2.5-0.5B",
        "export_kwargs": {
            "batch_size": 4,
            "sequence_length": 4096,
            "num_cores": 2,
            "auto_cast_type": "bf16",
        },
    },
    "granite": {
        "model_id": "ibm-granite/granite-3.1-2b-instruct",
        "export_kwargs": {
            "batch_size": 4,
            "sequence_length": 4096,
            "num_cores": 2,
            "auto_cast_type": "bf16",
        },
    },
}


def export_model(model_id, export_kwargs, neuron_model_path):
    export_command = [
        "optimum-cli",
        "export",
        "neuron",
        "-m",
        model_id,
        "--task",
        "text-generation",
    ]
    for kwarg, value in export_kwargs.items():
        export_command.append(f"--{kwarg}")
        export_command.append(str(value))
    export_command.append(neuron_model_path)
    logger.info(f"Exporting {model_id} with {export_kwargs}")
    try:
        subprocess.run(export_command, check=True)
    except subprocess.CalledProcessError as e:
        raise ValueError(f"Failed to export model: {e}")


@pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())
def neuron_model_config(request):
    """Expose a pre-trained neuron model

    The fixture exports a model locally and returns a dictionary containing:
    - a configuration name,
    - the original model id,
    - the export parameters,
    - the neuron model local path.

    For each exposed model, the local directory is maintained for the duration of the
    test session and cleaned up afterwards.

    """
    config_name = request.param
    model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
    model_id = model_config["model_id"]
    export_kwargs = model_config["export_kwargs"]
    with TemporaryDirectory() as neuron_model_path:
        export_model(model_id, export_kwargs, neuron_model_path)
        synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(neuron_model_path)
        del tokenizer
        # Add dynamic parameters to the model configuration
        model_config["neuron_model_path"] = neuron_model_path
        # Also add model configuration name to allow tests to adapt their expectations
        model_config["name"] = config_name
        # Yield instead of returning to keep a reference to the temporary directory.
        # It will go out of scope and be released only once all tests needing the fixture
        # have been completed.
        logger.info(f"{config_name} ready for testing ...")
        os.environ["CUSTOM_CACHE_REPO"] = OPTIMUM_CACHE_REPO_ID
        yield model_config
        logger.info(f"Done with {config_name}")


@pytest.fixture(scope="module")
def neuron_model_path(neuron_model_config):
    yield neuron_model_config["neuron_model_path"]


================================================
FILE: backends/neuron/tests/prune_test_models.py
================================================
from argparse import ArgumentParser
from huggingface_hub import HfApi


def main():
    parser = ArgumentParser()
    parser.add_argument("--yes", action="store_true", default=False)
    args = parser.parse_args()
    api = HfApi()
    models = api.list_models(search="optimum-internal-testing/neuron-tgi-testing")
    for model in models:
        if args.yes:
            delete = True
        else:
            answer = input(f"Do you want to delete {model.id} [y/N] ?")
            delete = answer == "y"
        if delete:
            api.delete_repo(model.id)
            print(f"Deleted {model.id}.")


if __name__ == "__main__":
    main()


================================================
FILE: backends/neuron/tests/pytest.ini
================================================
[pytest]
asyncio_mode = auto


================================================
FILE: backends/neuron/tests/requirements.txt
================================================
#  Copyright 2023 The HuggingFace Team. All rights reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
text-generation >= 0.6.0
pytest >= 7.4.0
pytest-asyncio >= 0.21.1
requests < 2.32.0
docker >= 6.1.3
Levenshtein


================================================
FILE: backends/neuron/tests/server/helpers.py
================================================
from text_generation_server.generator import NeuronGenerator
from text_generation_server.pb.generate_pb2 import (
    Batch,
    NextTokenChooserParameters,
    Request,
    StoppingCriteriaParameters,
)


def create_request(
    id: int,
    inputs: str,
    truncate: int = 0,
    max_new_tokens: int = 20,
    do_sample: bool = False,
    top_k: int = 50,
    top_p: float = 0.9,
    temperature: float = 1.0,
    seed: int = 42,
    repetition_penalty: float = 1.0,
):
    parameters = NextTokenChooserParameters(
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=do_sample,
        seed=seed,
        repetition_penalty=repetition_penalty,
    )
    stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens)
    return Request(
        id=id,
        inputs=inputs,
        truncate=truncate,
        parameters=parameters,
        stopping_parameters=stopping_parameters,
    )


def check_prefill(
    input_text,
    expected_token_id,
    expected_token_text,
    do_sample,
    batch_size,
    model_path,
):
    """Verify that a prefill for a single request generates the expected output."""
    generator = NeuronGenerator.from_pretrained(model_path)
    assert generator.model.batch_size >= batch_size
    requests = []
    max_new_tokens = 20
    for i in range(batch_size):
        requests.append(
            create_request(
                id=0,
                inputs=input_text,
                do_sample=do_sample,
                max_new_tokens=max_new_tokens,
            )
        )
    # Let's be pessimistic when estimating max_tokens
    batch_size * (len(input_text) + max_new_tokens)
    max_length = generator.model.max_length
    batch = Batch(
        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
    )
    generations, next_batch = generator.prefill(batch)
    assert next_batch.size == batch_size
    # Whatever was passed as max_tokens, the server will correct it
    # because of static batching
    assert next_batch.max_tokens == batch_size * max_length
    assert len(generations) == batch_size
    for g in generations:
        tokens = g.tokens
        assert tokens.ids == [expected_token_id]
        assert tokens.texts == [expected_token_text]


def check_decode_single(
    input_text, max_new_tokens, generated_text, do_sample, model_path
):
    """Verify that a decoding for a single request generates the expected output."""
    generator = NeuronGenerator.from_pretrained(model_path)
    request = create_request(
        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
    )
    max_length = generator.model.max_length
    batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
    generations, next_batch = generator.prefill(batch)
    # We already generated one token: call decode max_new_tokens - 1 times
    for _ in range(max_new_tokens - 1):
        assert next_batch.size == 1
        assert next_batch.max_tokens == max_length
        assert len(generations) == 1
        assert len(generations[0].tokens.ids) == 1
        generations, next_batch = generator.decode([next_batch])
    assert next_batch is None
    assert len(generations) == 1
    output = generations[0].generated_text
    assert output.generated_tokens == max_new_tokens
    assert output.finish_reason == 0
    assert output.text == generated_text


def check_decode_multiple(model_path):
    """Verify that two requests added to the batch at different generation steps
    generate the same outputs (continuous batching).
    """
    generator = NeuronGenerator.from_pretrained(model_path)
    assert generator.model.batch_size > 1
    input_text = "Once upon a time"
    max_new_tokens = 20
    # Prefill a single request, remembering the generated token
    tokens = {0: [], 1: []}
    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens)
    max_length = generator.model.max_length
    batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
    generations, next_batch = generator.prefill(batch)
    assert next_batch.size == 1
    assert len(generations) == 1
    g = generations[0]
    tokens[g.request_id].append(g.tokens.ids[0])
    assert len(tokens[0]) == 1
    # Decode a few tokens
    gen_tokens = 4
    for _ in range(gen_tokens - 1):
        generations, next_batch = generator.decode([next_batch])
        assert len(generations) == 1
        g = generations[0]
        tokens[g.request_id].append(g.tokens.ids[0])
    assert len(tokens[0]) == gen_tokens
    assert next_batch.size == 1
    # Add a second request
    request = create_request(id=1, inputs=input_text, max_new_tokens=max_new_tokens)
    batch = Batch(id=1, requests=[request], size=1, max_tokens=max_length)
    generations, next_batch_1 = generator.prefill(batch)
    assert next_batch_1.size == 1
    # We should have generated only a single token
    assert len(generations) == 1
    g = generations[0]
    tokens[g.request_id].append(g.tokens.ids[0])
    assert len(tokens[0]) == gen_tokens
    assert len(tokens[1]) == 1
    # Decode more tokens until we reach the maximum for the first request
    batches = [next_batch, next_batch_1]
    for _ in range(max_new_tokens - gen_tokens):
        generations, next_batch = generator.decode(batches)
        for g in generations:
            tokens[g.request_id].append(g.tokens.ids[0])
        batches = [next_batch]
    # Verify we now only have one pending request
    assert next_batch.size == 1
    assert len(tokens[0]) == max_new_tokens
    assert len(tokens[1]) == max_new_tokens - gen_tokens + 1
    # Verify we have the output for the first request
    for g in generations:
        if g.request_id == 0:
            output = g.generated_text
            assert output.text != ""
            assert output.generated_tokens == max_new_tokens
            generated_text = output.text
    # Continue decoding until the end of the second request
    for _ in range(gen_tokens - 1):
        generations, next_batch = generator.decode([next_batch])
        assert len(generations) == 1
        g = generations[0]
        tokens[g.request_id].append(g.tokens.ids[0])
    assert next_batch is None
    output = generations[0].generated_text
    assert output.generated_tokens == max_new_tokens
    assert tokens[0] == tokens[1]
    assert output.text == generated_text


================================================
FILE: backends/neuron/tests/server/test_cached_model.py
================================================
import os
import pytest

from text_generation_server.generator import NeuronGenerator
from text_generation_server.model import fetch_model, is_cached


@pytest.fixture(scope="module")
def cached_model_id(neuron_model_config) -> str:
    """
    Fixture to provide a cached model ID for testing.
    This assumes the model is already cached in the local environment.
    """
    export_kwargs = neuron_model_config["export_kwargs"]
    os.environ["MAX_BATCH_SIZE"] = str(export_kwargs["batch_size"])
    os.environ["MAX_TOTAL_TOKENS"] = str(export_kwargs["sequence_length"])
    os.environ["HF_AUTO_CAST_TYPE"] = export_kwargs["auto_cast_type"]
    os.environ["HF_NUM_CORES"] = str(export_kwargs["num_cores"])
    yield neuron_model_config["model_id"]
    os.environ.pop("MAX_BATCH_SIZE", None)
    os.environ.pop("MAX_TOTAL_TOKENS", None)
    os.environ.pop("HF_AUTO_CAST_TYPE", None)
    os.environ.pop("HF_NUM_CORES", None)


def test_model_is_cached(cached_model_id):
    assert is_cached(cached_model_id), f"Model {cached_model_id} is not cached"


def test_fetch_cached_model(cached_model_id: str):
    model_path = fetch_model(cached_model_id)
    assert os.path.exists(
        model_path
    ), f"Model {cached_model_id} was not fetched successfully"
    assert os.path.isdir(model_path), f"Model {cached_model_id} is not a directory"


def test_generator_from_cached_model(cached_model_id: str):
    generator = NeuronGenerator.from_pretrained(model_id=cached_model_id)
    assert generator is not None, "Generator could not be created from cached model"
    assert generator.model is not None, "Generator model is not initialized"
    assert generator.tokenizer is not None, "Generator tokenizer is not initialized"


================================================
FILE: backends/neuron/tests/server/test_continuous_batching.py
================================================
from helpers import create_request
from text_generation_server.generator import NeuronGenerator
from text_generation_server.pb.generate_pb2 import Batch


def test_continuous_batching_two_requests(neuron_model_config):
    """Verify that two requests added to the batch at different generation steps
    generate the same outputs (continuous batching).
    """
    neuron_model_path = neuron_model_config["neuron_model_path"]
    generator = NeuronGenerator.from_pretrained(neuron_model_path)
    assert generator.model.neuron_config.batch_size > 1
    input_text = "Once upon a time"
    max_new_tokens = 20
    # Prefill a single request, remembering the generated token
    tokens = {0: [], 1: []}
    request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens)
    max_length = generator.model.neuron_config.sequence_length
    batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
    generations, next_batch = generator.prefill(batch)
    assert next_batch.size == 1
    assert len(generations) == 1
    g = generations[0]
    tokens[g.request_id].append(g.tokens.ids[0])
    assert len(tokens[0]) == 1
    # Decode a few tokens
    gen_tokens = 4
    for _ in range(gen_tokens - 1):
        generations, next_batch = generator.decode([next_batch])
        assert len(generations) == 1
        g = generations[0]
        tokens[g.request_id].append(g.tokens.ids[0])
    assert len(tokens[0]) == gen_tokens
    assert next_batch.size == 1
    # Add a second request
    request = create_request(id=1, inputs=input_text, max_new_tokens=max_new_tokens)
    batch = Batch(id=1, requests=[request], size=1, max_tokens=max_length)
    generations, next_batch_1 = generator.prefill(batch)
    assert next_batch_1.size == 1
    # We should have generated only a single token
    assert len(generations) == 1
    g = generations[0]
    tokens[g.request_id].append(g.tokens.ids[0])
    assert len(tokens[0]) == gen_tokens
    assert len(tokens[1]) == 1
    # Decode more tokens until we reach the maximum for the first request
    batches = [next_batch, next_batch_1]
    for _ in range(max_new_tokens - gen_tokens):
        generations, next_batch = generator.decode(batches)
        for g in generations:
            tokens[g.request_id].append(g.tokens.ids[0])
        batches = [next_batch]
    # Verify we now only have one pending request
    assert next_batch.size == 1
    assert len(tokens[0]) == max_new_tokens
    assert len(tokens[1]) == max_new_tokens - gen_tokens + 1
    # Verify we have the output for the first request
    for g in generations:
        if g.request_id == 0:
            output = g.generated_text
            assert output.text != ""
            assert output.generated_tokens == max_new_tokens
            generated_text = output.text
    # Continue decoding until the end of the second request
    for _ in range(gen_tokens - 1):
        generations, next_batch = generator.decode([next_batch])
        assert len(generations) == 1
        g = generations[0]
        tokens[g.request_id].append(g.tokens.ids[0])
    assert next_batch is None
    output = generations[0].generated_text
    assert output.generated_tokens == max_new_tokens
    assert tokens[0] == tokens[1]
    assert output.text == generated_text


================================================
FILE: backends/neuron/tests/server/test_decode.py
================================================
from helpers import create_request
from text_generation_server.generator import NeuronGenerator
from text_generation_server.pb.generate_pb2 import Batch


def test_decode(neuron_model_config):
    """Verify that a decoding for a single request generates the expected output."""
    config_name = neuron_model_config["name"]
    neuron_model_path = neuron_model_config["neuron_model_path"]
    generator = NeuronGenerator.from_pretrained(neuron_model_path)
    for do_sample in [True, False]:
        mode = "sample" if do_sample else "greedy"
        print(f"{config_name}[{mode}]")
        generated_text = _test_decode(config_name, generator, do_sample)
        if not do_sample:
            expected_text = {
                "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
                "qwen2": " I was sitting in my room, staring at the clock, when a knock at the door. I",
                "granite": "\n\nThis opening line is from George Orwell's dystopian novel, \"1",
            }[config_name]
            assert generated_text == expected_text
        generator.clear()


def _test_decode(config_name, generator, do_sample):
    input_text = (
        "It was a bright cold day in April, and the clocks were striking thirteen."
    )
    max_new_tokens = 20
    request = create_request(
        id=0,
        inputs=input_text,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=0.9,
    )
    max_length = generator.model.neuron_config.sequence_length
    batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)
    generations, next_batch = generator.prefill(batch)
    # We already generated one token: call decode max_new_tokens - 1 times
    for _ in range(max_new_tokens - 1):
        assert next_batch.size == 1
        assert next_batch.max_tokens == max_length
        assert len(generations) == 1
        assert len(generations[0].tokens.ids) == 1
        generations, next_batch = generator.decode([next_batch])
    assert next_batch is None
    assert len(generations) == 1
    output = generations[0].generated_text
    assert output.generated_tokens == max_new_tokens
    assert output.finish_reason == 0
    return output.text


================================================
FILE: backends/neuron/tests/server/test_generator_slot.py
================================================
import pytest
import torch
from text_generation_server.generator import Slot
from text_generation_server.pb.generate_pb2 import Request
from transformers import AutoTokenizer, GenerationConfig


TOKENIZERS = ["NousResearch/Llama-2-7b-hf", "gpt2"]


@pytest.fixture(params=TOKENIZERS)
def tokenizer(request):
    t = AutoTokenizer.from_pretrained(request.param)
    t.padding_side = "left"
    t.pad_token_id = t.eos_token_id
    return t


@pytest.mark.parametrize(
    "input_text, generated_text",
    [
        [
            "It was a bright cold day in April, and the clocks were striking thirteen.",
            " Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind,"
            " slipped quickly through the glass doors of Victory Mansions, though not quickly enough"
            " to prevent a swirl of gritty dust from entering along with him.",
        ],
        ["This sentence is written in chinese:", "我很感谢你的热情"],
        ["Some text might contain a lot of emojis like 😃", "😍💪 👉 👀"],
    ],
    ids=["spaces", "chinese-utf8", "emojis"],
)
def test_decode_streaming(tokenizer, input_text, generated_text):
    slot = Slot(0, tokenizer)
    request = Request(id=0, inputs=input_text)
    slot.assign(0, request, GenerationConfig())
    assert slot.cached_text == input_text

    inputs = tokenizer(
        input_text,
        padding="max_length",
        max_length=len(input_text) + 1,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"][0]
    attention_mask = inputs["attention_mask"][0]
    generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"]

    # We need to regenerate the full text as the tokenizer might change it (extra spaces might be added)
    all_input_ids = torch.cat([input_ids, torch.tensor(generated_tokens)])
    full_text = tokenizer.decode(all_input_ids, skip_special_tokens=True)
    regenerated_text = full_text[len(input_text) :]

    # Initialize the slot with the inputs
    slot.reset(input_ids, attention_mask, selector=None)

    assert slot.generated_tokens == 0

    # Simulate an iterative generation (i.e. don't call select and use known tokens instead)
    decoded_text = ""
    for i in range(len(generated_tokens)):
        text = slot.append(generated_tokens[i])
        assert slot.generated_tokens == i + 1
        decoded_text += text

    assert decoded_text == regenerated_text


================================================
FILE: backends/neuron/tests/server/test_info.py
================================================
from text_generation_server.generator import NeuronGenerator


def test_info(neuron_model_path):
    generator = NeuronGenerator.from_pretrained(neuron_model_path)
    info = generator.info
    assert info.requires_padding is True
    assert info.device_type == "xla"
    assert info.window_size == 0
    assert info.speculate == 0


================================================
FILE: backends/neuron/tests/server/test_prefill.py
================================================
from helpers import create_request
from text_generation_server.generator import NeuronGenerator
from text_generation_server.pb.generate_pb2 import Batch


def test_prefill(neuron_model_config):
    """Verify that a prefill for a single request generates the expected output."""
    config_name = neuron_model_config["name"]
    neuron_model_path = neuron_model_config["neuron_model_path"]
    generator = NeuronGenerator.from_pretrained(neuron_model_path)
    max_batch_size = 4
    assert generator.model.neuron_config.batch_size >= max_batch_size
    for num_requests in [1, max_batch_size]:
        for do_sample in [True, False]:
            mode = "sample" if do_sample else "greedy"
            print(f"[{mode}]: {num_requests} requests")
            _test_prefill(config_name, generator, num_requests, do_sample)
            generator.clear()


def _test_prefill(config_name, generator, batch_size, do_sample):
    requests = []
    max_new_tokens = 20
    input_text = (
        "It was a bright cold day in April, and the clocks were striking thirteen."
    )
    for i in range(batch_size):
        requests.append(
            create_request(
                id=i,
                inputs=input_text,
                do_sample=do_sample,
                max_new_tokens=max_new_tokens,
            )
        )
    # Let's be pessimistic when estimating max_tokens
    max_length = generator.max_prefill_length()
    batch = Batch(
        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
    )
    generations, next_batch = generator.prefill(batch)
    assert next_batch.size == batch_size
    # Whatever was passed as max_tokens, the server will correct it
    # because of static batching
    assert next_batch.max_tokens == batch_size * max_length
    assert len(generations) == batch_size
    expectations = {
        "llama": [578, " The"],
        "qwen2": [358, " I"],
        "granite": [203, "\n"],
    }[config_name]
    # Greedy mode should always generate the same output
    if not do_sample:
        for g in generations:
            tokens = g.tokens
            assert tokens.ids[0] == expectations[0]
            assert tokens.texts[0] == expectations[1]


def test_prefill_truncate(neuron_model_config):
    config_name = neuron_model_config["name"]
    neuron_model_path = neuron_model_config["neuron_model_path"]
    generator = NeuronGenerator.from_pretrained(neuron_model_path)
    batch_size = generator.model.neuron_config.batch_size
    # We apply truncation to all requests but the first one
    truncate = [
        None,
    ] + [i * 3 for i in range(1, batch_size)]
    input_text = (
        "Two gin-scented tears trickled down the sides of his nose."
        " But it was all right, everything was all right, the struggle was finished."
        " He had won the victory over himself. He loved Big Brother."
    )
    requests = []
    for i in range(batch_size):
        requests.append(create_request(id=i, inputs=input_text, truncate=truncate[i]))
    max_length = generator.max_prefill_length()
    batch = Batch(
        id=0, requests=requests, size=batch_size, max_tokens=batch_size * max_length
    )
    generations, _ = generator.prefill(batch)
    # Even if the input text is identical for all requests, the first generated token might
    # be different because of the truncation
    expectations = {
        "llama": [" He", "iens", "\x08", " He"],
        "qwen2": [" He", "<|endoftext|>", " ", " The"],
        "granite": ["\n", "\n", "\n", "\n"],
    }[config_name]
    for i, g in enumerate(generations):
        tokens = g.tokens
        assert (
            tokens.texts[0] == expectations[i]
        ), f"Request {i} expected [{expectations[i]}], got [{tokens.texts[0]}]"


================================================
FILE: backends/neuron/tests/test_entry_point.py
================================================
import os
import pytest
from tempfile import TemporaryDirectory

from optimum.neuron.models.inference.nxd.backend.config import NxDNeuronConfig
from optimum.neuron.utils import map_torch_dtype

from text_generation_server.tgi_env import (
    get_neuron_config_for_model,
    lookup_compatible_cached_model,
    neuron_config_to_env,
)


def test_get_neuron_config_for_model(neuron_model_config):
    neuron_model_path = neuron_model_config["neuron_model_path"]
    export_kwargs = neuron_model_config["export_kwargs"]
    os.environ["MAX_BATCH_SIZE"] = str(export_kwargs["batch_size"])
    os.environ["MAX_TOTAL_TOKENS"] = str(export_kwargs["sequence_length"])
    os.environ["HF_AUTO_CAST_TYPE"] = export_kwargs["auto_cast_type"]
    os.environ["HF_NUM_CORES"] = str(export_kwargs["num_cores"])
    neuron_config = get_neuron_config_for_model(neuron_model_path)
    assert neuron_config is not None
    assert neuron_config.batch_size == export_kwargs["batch_size"]
    assert neuron_config.sequence_length == export_kwargs["sequence_length"]
    assert neuron_config.tp_degree == export_kwargs["num_cores"]
    if isinstance(neuron_config, NxDNeuronConfig):
        assert map_torch_dtype(neuron_config.torch_dtype) == map_torch_dtype(
            export_kwargs["auto_cast_type"]
        )
    else:
        assert map_torch_dtype(neuron_config.auto_cast_type) == map_torch_dtype(
            export_kwargs["auto_cast_type"]
        )


@pytest.mark.parametrize("model_id", ["unsloth/Llama-3.2-1B-Instruct"])
def test_lookup_compatible_cached_model(model_id: str):
    neuron_config = lookup_compatible_cached_model(model_id, None)
    assert neuron_config is not None


def test_neuron_config_to_env(neuron_model_config) -> None:
    neuron_model_path = neuron_model_config["neuron_model_path"]
    neuron_config = get_neuron_config_for_model(neuron_model_path)
    with TemporaryDirectory() as temp_dir:
        os.environ["ENV_FILEPATH"] = os.path.join(temp_dir, "env.sh")
        neuron_config_to_env(neuron_config)
        with open(os.environ["ENV_FILEPATH"], "r") as env_file:
            env_content = env_file.read()
            assert f"export MAX_BATCH_SIZE={neuron_config.batch_size}" in env_content
            assert (
                f"export MAX_TOTAL_TOKENS={neuron_config.sequence_length}"
                in env_content
            )
            assert f"export HF_NUM_CORES={neuron_config.tp_degree}" in env_content
            if hasattr(neuron_config, "torch_dtype"):
                auto_cast_type = str(map_torch_dtype(neuron_config.torch_dtype)).split(
                    "."
                )[-1]
            else:
                auto_cast_type = neuron_config.auto_cast_type
            assert f"export HF_AUTO_CAST_TYPE={auto_cast_type}" in env_content


================================================
FILE: backends/neuron/tgi-entrypoint.sh
================================================
#!/bin/bash
set -e -o pipefail -u

export ENV_FILEPATH=$(mktemp)

trap "rm -f ${ENV_FILEPATH}" EXIT

touch $ENV_FILEPATH

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

${SCRIPT_DIR}/tgi_entry_point.py $@

source $ENV_FILEPATH

exec text-generation-launcher $@


================================================
FILE: backends/neuron/tgi_entry_point.py
================================================
#!/usr/bin/env python

import logging
import os
import sys


from text_generation_server.tgi_env import (
    available_cores,
    get_env_dict,
    get_neuron_config_for_model,
    neuron_config_to_env,
    neuronxcc_version,
    parse_cmdline_and_set_env,
    tgi_env_vars,
)


logger = logging.getLogger(__name__)


def main():
    """
    This script determines proper default TGI env variables for the neuron precompiled models to
    work properly
    :return:
    """
    args = parse_cmdline_and_set_env()

    for env_var in tgi_env_vars:
        if not os.getenv(env_var):
            break
    else:
        logger.info(
            "All env vars %s already set, skipping, user know what they are doing",
            tgi_env_vars,
        )
        sys.exit(0)

    neuron_config = get_neuron_config_for_model(args.model_id, args.revision)

    if not neuron_config:
        msg = (
            "No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}"
        ).format(get_env_dict(), available_cores, neuronxcc_version)
        logger.error(msg)
        raise Exception(msg)

    neuron_config_to_env(neuron_config)


if __name__ == "__main__":
    main()


================================================
FILE: backends/trtllm/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.20)

if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
    cmake_policy(SET CMP0135 NEW)
endif ()

project(tgi-trtllm-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 23)

include(FetchContent)
include(ExternalProject)
include(CheckCXXCompilerFlag)

option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
option(TGI_TRTLLM_BACKEND_BUILD_USE_LLD "Enable lld linker instead of ld" OFF)
set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support")
set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE STRING "Path where TensorRT libraries and headers are located")
set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE STRING "Path where TensorRT headers are located")
set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")

# We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
find_package(MPI REQUIRED)

#### External dependencies ####
include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)

if (CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(TGI_TRTLLM_BACKEND_DEBUG ON)
    add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE)
endif ()

if (${TGI_TRTLLM_BACKEND_BUILD_USE_LLD})
    message(STATUS "Using lld linker")
    add_link_options("-fuse-ld=lld")
endif ()

# Let's build TRTLLM as part of CMake
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")

# Tell CMake to need try to override the RPATH for executorWorker as it has not information on how to do so
set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)

# TGI TRTLLM Backend definition
add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp csrc/backend.cpp)
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_include_directories(tgi_trtllm_backend_impl PRIVATE
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
        #        $<INSTALL_INTERFACE:csrc>
)
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)

# This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back
install(TARGETS tgi_trtllm_backend_impl)
#install(TARGETS cutlass_src fb_gemm_src fpA_intB_gemm_src gemm_swiglu_sm90_src kernels_src)
install(TARGETS decoder_attention_0 decoder_attention_1)
install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention_src executorWorker)
install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB)
if (NOT ${TGI_TRTLLM_BACKEND_DEBUG})
    install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
endif ()


#### Unit Tests ####
if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug")
    message(STATUS "Building tests")
    option(TGI_TRTLLM_BACKEND_ENABLE_ASAN "Enable AddressSanitizer")
    option(TGI_TRTLLM_BACKEND_ENABLE_UBSAN "Enable UndefinedSanitizer")

    FetchContent_Declare(
            Catch2
            URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz
    )
    FetchContent_MakeAvailable(Catch2)

    # This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function
    check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
    if (${COMPILER_SUPPORT_WARNING_ON_NVRO})
        message(STATUS "Enabling non-NVRO detection")
        target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wnrvo)
    endif ()
    target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wall)

    cmake_path(GET TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH PARENT_PATH TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH)
    message(STATUS "Adding linking path: ${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}")

    add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp)

    #    target_compile_options(tgi_trtllm_backend_tests PRIVATE -Werror)
    target_link_directories(tgi_trtllm_backend_tests PRIVATE "${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}")
    target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
    target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
    target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
    target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)
    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper)

    if (${TGI_TRTLLM_BACKEND_ENABLE_ASAN})
        message(STATUS "Enabled AddressSanitizer")
        target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=address)
    endif ()

    if (${TGI_TRTLLM_BACKEND_ENABLE_UBSAN})
        message(STATUS "Enabled UndefinedSanitizer")
        target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined)
    endif ()

    install(TARGETS tgi_trtllm_backend_tests)

    #    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
    #    include(CTest)
    #    include(Catch)
    #    catch_discover_tests(tgi_trtllm_backend_tests)
endif ()


================================================
FILE: backends/trtllm/Cargo.toml
================================================
[package]
name = "text-generation-backends-trtllm"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[dependencies]
async-trait = "0.1"
clap = { version = "4.5", features = ["derive"] }
cxx = "1.0"
hashbrown = "0.15"
hf-hub = { workspace = true }
text-generation-router = { path = "../../router" }
tokenizers = { workspace = true }
tokio = { version = "1.43.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokio-stream = "0.1.17"
thiserror = "1.0.63"
tracing = "0.1"
pyo3 = { workspace = true }

[build-dependencies]
cmake = "0.1"
cxx-build = { version = "1.0", features = ["parallel"] }
pkg-config = "0.3"


================================================
FILE: backends/trtllm/README.md
================================================
# Text Generation Inference - TensorRT-LLM Backend Implementation

## Description

This folder provides the sources of the TensorRT-LLM backend implementation powered by TensorRT-LLM Executor new API

## Simplified Request Sequence

```mermaid
sequenceDiagram
    actor User
    participant TextGenerationInference.HttpServer
    participant TextGenerationInference.TensorRtLlmBackend
    participant TextGenerationInference.TensorRtLlmWorkerThread
    participant TensorRtLlm.Executor
    participant Nvidia.Gpu
    User ->> TextGenerationInference.HttpServer: POST /generate
    TextGenerationInference.HttpServer ->> TextGenerationInference.TensorRtLlmBackend: Validate and forward inputs & parameters
    TextGenerationInference.TensorRtLlmBackend ->> TextGenerationInference.TensorRtLlmWorkerThread: Allocate a new context and spawn a new thread to handle the request
    TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Submit the request to the In-Flight Batcher
    activate Nvidia.Gpu
    TensorRtLlm.Executor ->> Nvidia.Gpu: Add the request to the poll for execution
    TensorRtLlm.Executor -->> TextGenerationInference.TensorRtLlmWorkerThread: Response with an unique request identifier
    rect rgb(10, 92, 54)
        loop every 100us
            rect rgb(15, 81, 50)
                alt Acquire lock to query executor
                    TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Poll request number of new token(s) generated
                else There are new generated tokens
                    TextGenerationInference.TensorRtLlmWorkerThread ->> TensorRtLlm.Executor: Retrieve newly generated tokens
                    TensorRtLlm.Executor -->> TextGenerationInference.TensorRtLlmWorkerThread: Return decoded token information and potential error (omitted)
                    rect rgb(11, 110, 79)
                        alt Generated token is final
                            TensorRtLlm.Executor ->> Nvidia.Gpu: Remove request from the scheduler and from the GPU
                            TextGenerationInference.TensorRtLlmWorkerThread -->> User: Stream the remaining decoded tokens and flush the connection
                        else Generated token is not final
                            TextGenerationInference.TensorRtLlmWorkerThread -->> User: Stream token back to the user as they get decoded
                        end
                    end
                end
            end
            deactivate Nvidia.Gpu
        end
    end

```


================================================
FILE: backends/trtllm/build.rs
================================================
use cxx_build::CFG;
use pkg_config;
use std::env;
use std::env::consts::ARCH;
use std::path::{absolute, PathBuf};
use std::sync::LazyLock;

const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
const CUDA_REQUIRED_VERSION: &str = "12.8";
const MPI_REQUIRED_VERSION: &str = "4.1";
const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
const NCCL_ROOT_DIR: Option<&str> = option_env!("NCCL_ROOT_DIR");

const IS_GHA_BUILD: LazyLock<bool> = LazyLock::new(|| {
    option_env!("SCCACHE_GHA_ENABLED").map_or(false, |value| match value.to_lowercase().as_str() {
        "on" => true,
        "true" => true,
        "1" => true,
        _ => false,
    })
});

// Dependencies
const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl";
const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
    ("dylib", "tensorrt_llm"),
    ("dylib", "tensorrt_llm_nvrtc_wrapper"),
    ("dylib", "nvinfer_plugin_tensorrt_llm"),
    ("dylib", "decoder_attention_0"),
    ("dylib", "decoder_attention_1"),
];

macro_rules! probe {
    ($name: expr, $version: expr) => {
        if let Err(_) = pkg_config::probe_library($name) {
            pkg_config::probe_library(&format!("{}-{}", $name, $version))
                .expect(&format!("Failed to locate {}", $name));
        }
    };
}

fn get_compiler_flag(
    switch: bool,
    true_case: &'static str,
    false_case: &'static str,
) -> &'static str {
    match switch {
        true => true_case,
        false => false_case,
    }
}

fn get_library_architecture() -> &'static str {
    let os = env::var("CARGO_CFG_TARGET_OS").unwrap();
    let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
    let env = env::var("CARGO_CFG_TARGET_ENV").unwrap();

    match os.as_str() {
        "linux" => {
            if env != "gnu" {
                panic!("unsupported linux ABI {env}, only 'gnu' is supported")
            }

            match arch.as_str() {
                "x86_64" => "x86_64-linux-gnu",
                "aarch64" => "aarch64-linux-gnu",
                _ => panic!("unsupported linux architecture {arch}"),
            }
        }
        "windows" => {
            if env != "msvc" {
                panic!("unsupported windows ABI {env}, only 'msvc' is supported")
            }

            match arch.as_str() {
                "x86_64" => "x86_64-windows-msvc",
                _ => panic!("unsupported windows architecture {arch}"),
            }
        }
        _ => panic!("unsupported OS {os}"),
    }
}

fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf, PathBuf) {
    // Build the backend implementation through CMake
    let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
    let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt");
    let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real");

    let mut install_path = PathBuf::from(install_path);
    if !install_path.is_absolute() {
        install_path = absolute(out_dir).expect("cannot happen").join(install_path);
    }

    let mut config = cmake::Config::new(".");
    config
        .uses_cxx11()
        .generator("Ninja")
        .profile(match is_debug {
            true => "Debug",
            false => "Release",
        })
        .env("OPT_LEVEL", opt_level)
        .define("CMAKE_INSTALL_PREFIX", &install_path)
        .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
        .define("CMAKE_LIBRARY_ARCHITECTURE", get_library_architecture())
        .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
        .define(
            "TGI_TRTLLM_BACKEND_DEBUG",
            get_compiler_flag(is_debug, "ON", "OFF"),
        )
        .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path);

    if is_debug || *IS_GHA_BUILD {
        config.define("TGI_TRTLLM_BACKEND_BUILD_TESTS", "ON");
    }

    if option_env!("USE_LLD_LINKER").is_some() {
        println!("cargo:warning=Using lld linker");
        config.define("TGI_TRTLLM_BACKEND_BUILD_USE_LLD", "ON");
    }

    if (is_debug && option_env!("ENABLE_ASAN").is_some()) || *IS_GHA_BUILD {
        println!("cargo:warning=Enabling Address Sanitizer");
        config.define("TGI_TRTLLM_BACKEND_ENABLE_ASAN", "ON");
    }

    if (is_debug && option_env!("ENABLE_UBSAN").is_some()) || *IS_GHA_BUILD {
        println!("cargo:warning=Enabling Undefined Sanitizer");
        config.define("TGI_TRTLLM_BACKEND_ENABLE_UBSAN", "ON");
    }

    if let Some(nvcc_host_compiler) = option_env!("CMAKE_CUDA_HOST_COMPILER") {
        config.define("CMAKE_CUDA_HOST_COMPILER", nvcc_host_compiler);
    }

    if let Some(wrapper) = option_env!("RUSTC_WRAPPER") {
        println!("cargo:warning=Using caching tool: {wrapper}");
        config.define("CMAKE_C_COMPILER_LAUNCHER", wrapper);
        config.define("CMAKE_CXX_COMPILER_LAUNCHER", wrapper);
        config.define("CMAKE_CUDA_COMPILER_LAUNCHER", wrapper);
    }

    // Allow to override which Python to use ...
    if let Some(python3) = option_env!("Python3_EXECUTABLE") {
        config.define("Python3_EXECUTABLE", python3);
    }

    config.build();

    // Additional transitive CMake dependencies
    let deps_folder = out_dir.join("build").join("_deps");
    for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES {
        let dep_name = match is_debug {
            true => format!("{}d", dependency),
            false => String::from(dependency),
        };
        let dep_path = deps_folder.join(format!("{}-build", dependency));
        println!("cargo:rustc-link-search={}", dep_path.display());
        println!("cargo:rustc-link-lib=static={}", dep_name);
    }

    // Emit linkage information from the artifacts we just built
    for path in ["lib", "lib64"] {
        let install_lib_path = install_path.join(path);
        println!(
            r"cargo:warning=Adding link search path: {}",
            install_lib_path.display()
        );
        println!(r"cargo:rustc-link-search={}", install_lib_path.display());
    }
    (PathBuf::from(install_path), deps_folder)
}

fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
    CFG.include_prefix = "backends/trtllm";
    cxx_build::bridge("src/lib.rs")
        .static_flag(true)
        .std("c++23")
        .include(deps_folder.join("spdlog-src").join("include"))
        .include(deps_folder.join("json-src").join("include"))
        .include(deps_folder.join("trtllm-src").join("cpp").join("include"))
        .include("/usr/local/cuda/include")
        .include("/usr/local/tensorrt/include")
        .include("csrc/")
        .file("csrc/ffi.hpp")
        .define(
            "TGI_TRTLLM_BACKEND_DEBUG",
            get_compiler_flag(is_debug, "ON", "OFF"),
        )
        .compile("tgi_trtllm_backend");

    println!("cargo:rerun-if-changed=CMakeLists.txt");
    println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
    println!("cargo:rerun-if-changed=cmake/json.cmake");
    println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
    println!("cargo:rerun-if-changed=csrc/backend.hpp");
    println!("cargo:rerun-if-changed=csrc/backend.cpp");
    println!("cargo:rerun-if-changed=csrc/hardware.hpp");
    println!("cargo:rerun-if-changed=csrc/ffi.hpp");
}

fn main() {
    // Misc variables
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    let build_profile = env::var("PROFILE").unwrap();
    let (is_debug, opt_level) = match build_profile.as_ref() {
        "debug" => (true, "0"),
        "dev" => (true, "0"),
        _ => (false, "3"),
    };

    // Build the backend
    let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir);

    // Build the FFI layer calling the backend above
    build_ffi_layer(&deps_folder, is_debug);

    // Emit linkage search path
    probe!("ompi", MPI_REQUIRED_VERSION);

    // Probe CUDA & co. with pkg-config
    CUDA_TRANSITIVE_DEPS.iter().for_each(|name| {
        probe!(name, CUDA_REQUIRED_VERSION);
    });

    // NCCL is slightly trickier because it might not have a pkgconfig installed
    let nccl_library_path_default = format!("/usr/local/{}-linux-gnu", ARCH);
    let nccl_library_path = NCCL_ROOT_DIR.unwrap_or(&nccl_library_path_default);
    println!(r"cargo:rustc-link-search=native={}", nccl_library_path);
    println!("cargo:rustc-link-lib=dylib=nccl");

    // TensorRT
    let tensort_library_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt/lib");
    println!(r"cargo:rustc-link-search=native={}", tensort_library_path);
    println!("cargo:rustc-link-lib=dylib=nvinfer");

    // TensorRT-LLM
    TENSORRT_LLM_TRANSITIVE_DEPS
        .iter()
        .for_each(|(link_type, name)| {
            println!("cargo:rustc-link-lib={}={}", link_type, name);
        });

    // Backend
    println!("cargo:rustc-link-lib=static={}", &BACKEND_DEPS);
}


================================================
FILE: backends/trtllm/cmake/json.cmake
================================================
fetchcontent_declare(
        json
#        DOWNLOAD_EXTRACT_TIMESTAMP
        URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz
)
fetchcontent_makeavailable(json)


================================================
FILE: backends/trtllm/cmake/spdlog.cmake
================================================
set(SPDLOG_USE_FMT ON)
set(SPDLOG_BUILD_SHARED OFF)
set(SPDLOG_FMT_EXTERNAL OFF)

# Define the level at which SPDLOG_ compilation level is defined
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE)
else ()
    add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG)
endif ()

fetchcontent_declare(
        spdlog
        #        DOWNLOAD_EXTRACT_TIMESTAMP
        URL https://github.com/gabime/spdlog/archive/refs/tags/v1.15.0.tar.gz
)
fetchcontent_makeavailable(spdlog)


================================================
FILE: backends/trtllm/cmake/trtllm.cmake
================================================
set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})

set(USE_CXX11_ABI ON)
set(BUILD_PYT OFF)
set(BUILD_PYBIND OFF)
set(BUILD_MICRO_BENCHMARKS OFF)
set(BUILD_BENCHMARKS OFF)
set(BUILD_TESTS OFF)
set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})

message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")

set(ENABLE_UCX OFF)
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
    set(FAST_BUILD ON)
    set(NVTX_DISABLE ON)
    set(INDEX_RANGE_CHECK ON)
else ()
    set(FAST_BUILD OFF)
    set(FAST_MATH ON)
    set(NVTX_DISABLE OFF)
    set(INDEX_RANGE_CHECK OFF)
endif ()

find_package(Python3 REQUIRED Interpreter)

fetchcontent_declare(
        trtllm
        GIT_REPOSITORY https://github.com/nvidia/TensorRT-LLM.git
        GIT_TAG v0.17.0
        GIT_SHALLOW ON
        DOWNLOAD_EXTRACT_TIMESTAMP
)
fetchcontent_makeavailable(trtllm)

message(STATUS "Found TensorRT-LLM: ${trtllm_SOURCE_DIR}")
execute_process(COMMAND git lfs install WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
execute_process(COMMAND git lfs pull WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")

# TRTLLM use a JIT based *precompiled* library to generate some specific kernels, we are generating the path to this one here
set(TRTLLM_NVRTC_LIBRARY_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}tensorrt_llm_nvrtc_wrapper${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE INTERNAL "nvrtc wrapper library name")
set(TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH "${trtllm_SOURCE_DIR}/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/${CMAKE_LIBRARY_ARCHITECTURE}/${TRTLLM_NVRTC_LIBRARY_NAME}"
        CACHE INTERNAL "nvrtc wrapper library path")

# The same Executor Static library
set(TRTLLM_EXECUTOR_STATIC_LIBRARY_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}tensorrt_llm_executor_static${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE INTERNAL "executor_static library name")
set(TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH "${trtllm_SOURCE_DIR}/cpp/tensorrt_llm/executor/${CMAKE_LIBRARY_ARCHITECTURE}/${TRTLLM_EXECUTOR_STATIC_LIBRARY_NAME}" CACHE INTERNAL "executor_static library path")


================================================
FILE: backends/trtllm/cmake/utils/detect_cuda_arch.cu
================================================


================================================
FILE: backends/trtllm/csrc/backend.cpp
================================================
#include <ranges>

#include <nlohmann/json.hpp>

#include "backend.hpp"
#include "hardware.hpp"

namespace huggingface::tgi::backends::trtllm {
    tle::ParallelConfig backend_workspace_t::parallel_config() const {
        // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
        const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();

        auto mode = tle::CommunicationMode::kLEADER;
        std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;

        if (world_size > 1) {
            SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
            mode = tle::CommunicationMode::kORCHESTRATOR;
            orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr,
                                                                             true);
        } else {
            SPDLOG_INFO("Detected single engine deployment, using leader mode");
        }

        return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
    }


    tle::ExecutorConfig backend_workspace_t::executor_config() const {
        // Retrieve the compute capabilities to enable some options at runtime
        const auto compute_capabilities = hardware::cuda::compute_capabilities_t();

        // Allocate the config
        tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);

        // Set the parallel config as inferred
        executor_config.setParallelConfig(parallel_config());

        // Define some configuration variables
        executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
        executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
        executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
        return executor_config;
    }

    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
            : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

    size_t backend_t::num_tokens_ready() const noexcept {
        return executor_.getNumResponsesReady();
    }

    std::expected<request_id_t, backend_error_t>
    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t g_params,
                      const sampling_params_t s_params) noexcept {
        SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params);
        return executor_.enqueueRequest(tle::Request{
                {token_ids.begin(), token_ids.end()},  // Making actual copy of the tokens
                static_cast<tle::SizeType32>(g_params.max_new_tokens),
                true,
                (tle::SamplingConfig) s_params,
                tle::OutputConfig{ /* returnLogProbs= */ true},
                std::nullopt,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                workspace.generation_config().stop_words
        });
    }

    std::vector<tle::Response> backend_t::pull_tokens() noexcept {
        SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready());
        return executor_.awaitResponses();
    }

    void backend_t::cancel(request_id_t request_id) noexcept {
        SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
        executor_.cancelRequest(request_id);
    }
}


================================================
FILE: backends/trtllm/csrc/backend.hpp
================================================
#ifndef TGI_BACKEND_TRTLLM
#define TGI_BACKEND_TRTLLM

#include <cmath>
#include <cstdint>
#include <expected>
#include <fstream>
#include <list>
#include <span>

#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <spdlog/fmt/fmt.h>

#include <tensorrt_llm/executor/executor.h>

namespace huggingface::tgi::backends::trtllm {
    namespace tle = tensorrt_llm::executor;
    using json = nlohmann::json;
    using request_id_t = uint64_t;
    using token_id_t = tle::TokenIdType;

    /**
     * Represent the parameters used for generation
     */
    struct generation_params_t {
        uint32_t max_new_tokens;
    };

    /**
     * Represent the parameters used to sample tokens from the logit distribution
     */
    struct sampling_params_t {
        uint32_t top_k;
        float_t top_p;
        float_t repetition_penalty;
        float_t frequency_penalty;
        float_t temperature;
        uint64_t seed;

        constexpr explicit operator tle::SamplingConfig() const {
            return tle::SamplingConfig{
                    1,
                    top_k,
                    top_p,
                    std::nullopt,
                    std::nullopt,
                    std::nullopt,
                    seed,
                    temperature,
                    std::nullopt,
                    std::nullopt,
                    repetition_penalty,
                    std::nullopt,
                    frequency_penalty,
                    std::nullopt
            };
        }
    };

    /**
     * Represent possible values from transformers generation `generation_config.json`.
     * It usually stores default sampling parameters to use, such as top_p, temperature, etc.
     */
    struct generation_config_t {
        float_t top_p;
        float_t temperature;
        std::list<std::vector<int32_t>> stop_words;

        constexpr explicit generation_config_t(const json &config) :
                top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
            if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
                const auto &eos_token_id = config["/eos_token_id"_json_pointer];
                std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
                    stop_words.emplace_back(1, token_id.template get<int32_t>());
                });

                SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
            }
        }
    };

    /**
     * Helper class representing various items which are stored within the TensorRT-LLM engines folder and
     * can be retrieved at runtime
     */
    class backend_workspace_t {
    private:
        constexpr static auto as_json = [](const std::filesystem::path &path) -> json {
            std::ifstream config_f(path);
            return json::parse(config_f);
        };

        std::filesystem::path engines_folder_;
        std::filesystem::path executor_worker_path_;
        json config_;
        generation_config_t generation_config_;

    public:
        backend_workspace_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) :
                engines_folder_(engines_folder),
                executor_worker_path_(executor_worker_path),
                config_(as_json(engines_folder / "config.json")),
                generation_config_(as_json(engines_folder / "generation_config.json")) {};

        backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path) :
                engines_folder_(engines_folder),
                executor_worker_path_(executor_worker_path),
                config_(as_json(engines_folder / "config.json")),
                generation_config_(as_json(engines_folder / "generation_config.json")) {};

        /**
         * Path to the folder containing the TensorRT-LLM engines
         * @return local filesystem path to the folder
         */
        [[nodiscard]] constexpr std::filesystem::path engines_folder() const { return engines_folder_; }

        /**
         * Hugging Face transformers' generated `generation_config_t` mapping information stored in the
         * `generation_config.json` holding default generation parameters.
         * @return `generation_config_t`
         */
        [[nodiscard]] constexpr const generation_config_t &generation_config() const { return generation_config_; }

        /**
         * Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used
         * to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information
         * @return `tensorrt_llm::executor::ParallelConfig` instance
         */
        [[nodiscard]] tle::ParallelConfig parallel_config() const;

        /**
         * Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used
         * to initialize `tensorrt_llm::executor::Executor`
         * @return `tensorrt_llm::executor::ExecutorConfig` instance
         */
        [[nodiscard]] tle::ExecutorConfig executor_config() const;
    };

    /**
     * Error raised by the underlying backend implementation
     */
    enum backend_error_t {
        EXECUTOR_NOT_READY = 3,
        EXECUTOR_SCHEDULING_FAILED = 4,
    };


    /**
     * Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to
     * - schedule new request
     * - pull status of submitted request(s)
     * - cancel submitted request(s)
     */
    class backend_t {
    private:
        backend_workspace_t workspace;
        tle::Executor executor_;

    public:
        backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);

        backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
                : backend_t(engines_folder, executor_worker_path) {};

        /**
         * Submit a new request to the executor
         * @param token_ids
         * @param generation_params
         * @param sampling_params
         * @return Either newly submitted request's id or the error why it failed to submit
         */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
        std::expected<request_id_t, backend_error_t>
        submit(std::span<const token_id_t> token_ids, generation_params_t generation_params,
               sampling_params_t sampling_params) noexcept;

        /**
         * Query the number of tokens available across all in-flight generations
         * @return
         */
        [[nodiscard("Pulling out the number of tokens")]]
        size_t num_tokens_ready() const noexcept;

        /**
         * Pull out newly generated tokens from the executor
         * @return
         */
        [[nodiscard("")]]
        std::vector<tle::Response> pull_tokens() noexcept;

        /**
         * Cancel the specified request on the executor' set
         * @param request_id Request's Identifier to remove from the in-flight executor
         */
        void cancel(request_id_t) noexcept;
    };

    /**
     * Create a TensorRT-LLM executor from a workspace
     */
    const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
        return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY,
                workspace.executor_config()};
    };
}

/**
 * Helper structures to define formatting strategies for various types in the backend
 */
template<>
struct fmt::formatter<huggingface::tgi::backends::trtllm::generation_params_t> : formatter<string_view> {
    auto format(huggingface::tgi::backends::trtllm::generation_params_t const &c,
                format_context &ctx) const -> format_context::iterator {
        return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens);
    }
};

template<>
struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> : formatter<string_view> {
    auto format(huggingface::tgi::backends::trtllm::sampling_params_t const &c,
                format_context &ctx) const -> format_context::iterator {
        return fmt::format_to(
                ctx.out(),
                "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
                c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
        );
    }
};

#endif


================================================
FILE: backends/trtllm/csrc/ffi.hpp
================================================
#ifndef TGI_BACKEND_TRTLLM_FFI
#define TGI_BACKEND_TRTLLM_FFI

#include <memory>
#include <thread>

#include <nvml.h>
#include <tensorrt_llm/common/tllmException.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>

#include <spdlog/spdlog.h>

#include <backend.hpp>
#include <hardware.hpp>

namespace rust::behavior {
    template<typename Try, typename Fail>
    static void trycatch(Try &&func, Fail &&fail) noexcept try {
        func();
    } catch (tensorrt_llm::common::TllmException &e) {
        fail(e.what());
    }
}

namespace huggingface::tgi::backends::trtllm {
    class tensorrt_llm_backend_t;
}

#include "backends/trtllm/src/lib.rs.h"


namespace huggingface::tgi::backends::trtllm {
    std::once_flag backend_initialized_flag;

    constexpr finish_reason_t as_finish_reason_t(const tle::FinishReason reason) noexcept {
        switch (reason) {
            case tle::FinishReason::kNOT_FINISHED:
                return finish_reason_t::kNOT_FINISHED;
            case tle::FinishReason::kSTOP_WORDS:
                return finish_reason_t::kSTOP_WORDS;
            case tle::FinishReason::kEND_ID:
                return finish_reason_t::kEND_ID;
            case tle::FinishReason::kLENGTH:
                return finish_reason_t::kLENGTH;
            default:
                std::unreachable();
        }
    }

    static auto as_generation_step = [](const tle::Response &r) {
        const auto reqId = r.getRequestId();
        if (!r.hasError()) [[likely]] {
            const auto result = r.getResult();
            const auto logits = result.logProbs.value()[0];
            return generation_step_t{
                    reqId,
                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
                    logits.back(),
                    result.isFinal,
                    as_finish_reason_t(result.finishReasons[0]),
                    false,
                    std::string()
            };
        } else {
            return generation_step_t{
                    reqId,
                    0,
                    0.0,
                    true,
                    finish_reason_t::kNOT_FINISHED,
                    true,
                    std::move(r.getErrorMsg())
            };
        }
    };


    class tensorrt_llm_backend_t {
    private:
        backend_t inner_;

    public:
        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
                : inner_(engine_folder, executor_worker_path) {}

        size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); }

        request_id_t submit(
                rust::Slice<const uint32_t> tokens,
                uint32_t max_new_tokens,
                uint32_t top_k,
                float_t top_p,
                float_t temperature,
                float_t repetition_penalty,
                float_t frequency_penalty,
                uint64_t seed
        ) {
            // This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
            SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));

            // Submit the request to the executor and get back a potential request_id used to track request status
            const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
            const auto maybe_request_id = inner_.submit(
                    signed_tokens,
                    {max_new_tokens},
                    {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
            );

            // If we do have a value, let's return the request_id
            if (maybe_request_id.has_value()) [[likely]] {
                return *maybe_request_id;
            } else {
                SPDLOG_WARN("[FFI] Failed to submit request to the executor");
                return maybe_request_id.error();
            }
        }

        std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
            if (num_tokens_ready() > 0) [[likely]] {
                const auto responses = inner_.pull_tokens();

                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());

                // Transform tle::Response to generation_step_t
#ifdef __cpp_lib_ranges_to_container
                auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to<std::vector>();
#else
                auto steps = std::vector<generation_step_t>();
                steps.reserve(responses.size());
                std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step);
#endif
                return std::make_unique<std::vector<generation_step_t>>(steps);

            } else {
                return std::make_unique<std::vector<generation_step_t>>();
            }
        }

        void cancel(request_id_t request_id) noexcept {
            SPDLOG_DEBUG("[FFI] cancelling request {:d}", request_id);
            inner_.cancel(request_id);
        }
    };

    void initialize_logging() {
#ifndef TGI_TRTLLM_BACKEND_DEBUG
        if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
            std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
            std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
                return std::tolower(c);
            });

            if (log_level == "debug")
                spdlog::set_level(spdlog::level::debug);
            else
                spdlog::set_level(spdlog::level::info);
        }
#else
        spdlog::set_level(spdlog::level::debug);
#endif
    }

    void initialize_tensorrt_llm_backend() {
        SPDLOG_INFO("Initializing TGI - TensoRT-LLM Backend (v{})", tle::version());

        // Initialize everyone
        initialize_logging();
        nvmlInit_v2();
        initTrtLlmPlugins();

        const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count();
        if (numGpus.has_value()) {
            SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", *numGpus);
        } else {
            SPDLOG_WARN("[FFI] Failed to detected Nvidia GPU(s) on the system");
            // todo: throw
        }
    }

    std::unique_ptr<tensorrt_llm_backend_t>
    create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
        std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
        return std::make_unique<tensorrt_llm_backend_t>(
                std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()),
                                      std::filesystem::path::format::auto_format),
                std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()),
                                      std::filesystem::path::format::auto_format)
        );
    }
}
#endif


================================================
FILE: backends/trtllm/csrc/hardware.hpp
================================================
#ifndef TGI_HARDWARE_CUDA
#define TGI_HARDWARE_CUDA
#include <cstdint>
#include <optional>

#include <nvml.h>

namespace huggingface::tgi::hardware::cuda {
    static constexpr auto VOLTA = std::make_tuple(7u, 0u);
    static constexpr auto TURING = std::make_tuple(7u, 5u);
    static constexpr auto AMPERE = std::make_tuple(8u, 0u);
    static constexpr auto HOPPER = std::make_tuple(9u, 0u);
    static constexpr auto ADA_LOVELACE = std::make_tuple(8u, 9u);

    /**
     * Get the number of GPUs on the local machine
     * @return std::nullopt if no device is available, otherwise >= 1
     */
    inline std::optional<size_t> get_device_count() {
        uint32_t numGpus = 0;
        if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
            return numGpus;
        }
        return std::nullopt;
    }

    /**
     * Store information about the version of the CUDA Compute Capabilities detected on the device
     */
    struct compute_capabilities_t {
        int32_t major;
        int32_t minor;

        compute_capabilities_t(): compute_capabilities_t(0) {}
        explicit compute_capabilities_t(size_t device_idx): major(-1), minor(-1) {
            nvmlDevice_t device;
            if (nvmlDeviceGetHandleByIndex_v2(device_idx, &device) == NVML_SUCCESS) {
               nvmlDeviceGetCudaComputeCapability(device, &major, &minor);
            }
        };
        compute_capabilities_t(int32_t major, int32_t minor): major(major), minor(minor) {}

        /**
         * Evaluate if the underlying capabilities is at least greater or equals to the provided 2-tuple (major, minor)
         * @param sm Architecture version (major, minor)
         * @return True if greater or equals to the underlying compute capabilities
         */
        [[nodiscard]] constexpr auto is_at_least(std::tuple<uint32_t, uint32_t> sm) const -> decltype(auto) { return std::tie(major, minor) >= sm; }

        /**
         * Check if the capabilities match at least Volta architecture (sm_70)
         * @return true if at least Volta (>= sm_70), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_volta() const { return is_at_least(VOLTA); }

        /**
         * Check if the capabilities match at least Turing architecture (sm_75)
         * @return true if at least Turing (>= sm_75), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_turing() const { return is_at_least(TURING); }

        /**
         * Check if the capabilities match at least Ampere architecture (sm_80)
         * @return true if at least Ampere (>= sm_80), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_ampere() const { return is_at_least(AMPERE); }

        /**
         * Check if the capabilities match at least Ada Lovelace architecture (sm_89)
         * @return true if at least Ada Lovelace (>= sm_89), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_ada_lovelace() const { return is_at_least(ADA_LOVELACE); }

        /**
         * Check if the capabilities match at least Hopper architecture (sm_90)
         * @return true if at least Hopper (>= sm_90), false otherwise
         */
        [[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); }
    };
}
#endif


================================================
FILE: backends/trtllm/scripts/install_tensorrt.sh
================================================
#!/bin/bash

set -ex

TRT_VER_BASE="10.8.0"
TRT_VER_FULL="${TRT_VER_BASE}.43"
CUDA_VER="12.8"
CUDNN_VER="9.7.0.66-1"
NCCL_VER="2.25.1-1+cuda${CUDA_VER}"
CUBLAS_VER="${CUDA_VER}.3.14-1"
NVRTC_VER="${CUDA_VER}.61-1"

for i in "$@"; do
    case $i in
        --TRT_VER=?*) TRT_VER="${i#*=}";;
        --CUDA_VER=?*) CUDA_VER="${i#*=}";;
        --CUDNN_VER=?*) CUDNN_VER="${i#*=}";;
        --NCCL_VER=?*) NCCL_VER="${i#*=}";;
        --CUBLAS_VER=?*) CUBLAS_VER="${i#*=}";;
        *) ;;
    esac
    shift
done

NVCC_VERSION_OUTPUT=$(nvcc --version)
if [[ $(echo $NVCC_VERSION_OUTPUT | grep -oP "\d+\.\d+" | head -n 1) != ${CUDA_VER} ]]; then
  echo "The version of pre-installed CUDA is not equal to ${CUDA_VER}."
  exit 1
fi

install_ubuntu_requirements() {
    apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates
    ARCH=$(uname -m)
    if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
    if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi
    curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH}/cuda-keyring_1.1-1_all.deb
    dpkg -i cuda-keyring_1.1-1_all.deb
    rm /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list

    apt-get update
    if [[ $(apt list --installed | grep libcudnn9) ]]; then
      apt-get remove --purge -y --allow-change-held-packages libcudnn9*
    fi
    if [[ $(apt list --installed | grep libnccl) ]]; then
      apt-get remove --purge -y --allow-change-held-packages libnccl*
    fi
    if [[ $(apt list --installed | grep libcublas) ]]; then
      apt-get remove --purge -y --allow-change-held-packages libcublas*
    fi
    if [[ $(apt list --installed | grep cuda-nvrtc-dev) ]]; then
      apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
    fi
    CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
    apt-get install -y --no-install-recommends libcudnn9-cuda-12=${CUDNN_VER} libcudnn9-dev-cuda-12=${CUDNN_VER}
    apt-get install -y --no-install-recommends libnccl2=${NCCL_VER} libnccl-dev=${NCCL_VER}
    apt-get install -y --no-install-recommends libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} libcublas-dev-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER}
    # NVRTC static library doesn't exist in NGC PyTorch container.
    NVRTC_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
    apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
    apt-get clean
    rm -rf /var/lib/apt/lists/*
}

install_centos_requirements() {
    CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
    yum -y update
    yum -y install epel-release
    yum remove -y libnccl* && yum -y install libnccl-${NCCL_VER} libnccl-devel-${NCCL_VER}
    yum remove -y libcublas* && yum -y install libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER} libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}
    yum clean all
}

install_tensorrt() {
    #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
    #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
    TRT_CUDA_VERSION="12.8"

    if [ -z "$RELEASE_URL_TRT" ];then
        ARCH=${TRT_TARGETARCH}
        if [ -z "$ARCH" ];then ARCH=$(uname -m);fi
        if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
        if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
        if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
        if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-24.04" && OS="ubuntu-24.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
        RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${TRT_VER_BASE}/tars/TensorRT-${TRT_VER_FULL}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
    fi
    wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
    tar -xf /tmp/TensorRT.tar -C /usr/local/
    mv /usr/local/TensorRT-${TRT_VER_FULL} /usr/local/tensorrt
    # pip3 install /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl
    rm -rf /tmp/TensorRT.tar
}

# Install base packages depending on the base OS
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
  debian)
    install_ubuntu_requirements
    install_tensorrt
    ;;
  ubuntu)
    install_ubuntu_requirements
    install_tensorrt
    ;;
  centos)
    install_centos_requirements
    install_tensorrt
    ;;
  *)
    echo "Unable to determine OS..."
    exit 1
    ;;
esac


================================================
FILE: backends/trtllm/scripts/setup_sccache.py
================================================
from argparse import ArgumentParser

AWS_S3_CACHING_VARIABLES = {
    "AWS_ACCESS_KEY_ID": "aws_access_key_id",
    "AWS_SECRET_ACCESS_KEY": "aws_secret_access_key",
    "AWS_SESSION_TOKEN": "aws_session_token",
    "SCCACHE_REGION": "s3_region",
    "SCCACHE_BUCKET": "s3_bucket_name",
}

ALL_CACHING_STORAGE_VARIABLES = {"AWS_S3_CACHING_VARIABLES"}


def setup_sccache_locally():
    from os import environ

    print("Setting up Local Caching Layer")
    for target in ALL_CACHING_STORAGE_VARIABLES:
        for envvar in globals()[target].keys():
            if envvar in environ:
                print(f"Deleted {envvar} from environment variables")
                del environ[envvar]


def setup_sccache_for_s3():
    from os import environ

    print("Setting up AWS S3 Caching Layer")
    for envvar in AWS_S3_CACHING_VARIABLES.keys():
        if envvar not in environ or not environ[envvar] or len(environ[envvar]) == 0:
            print(f"Missing definition for environment variable {envvar}")


if __name__ == "__main__":
    parser = ArgumentParser("TensorRT-LLM Build Caching Setup")

    parser.add_argument(
        "--is-gha-build",
        type=str,
        default="FALSE",
        help="Indicate if the build is from Github Actions",
    )

    # Parse args
    args = parser.parse_args()
    args.is_gha_build = args.is_gha_build.lower() in {"on", "true", "1"}

    if args.is_gha_build:
        setup_sccache_for_s3()
    else:
        setup_sccache_locally()


================================================
FILE: backends/trtllm/src/errors.rs
================================================
use std::path::PathBuf;
use thiserror::Error;

use text_generation_router::server;

#[derive(Debug, Error)]
pub enum TensorRtLlmBackendError {
    #[error("Provided engine folder {0} doesn't exist")]
    EngineFolderDoesntExists(PathBuf),
    #[error("Provided executorWorker binary path {0} doesn't exist")]
    ExecutorWorkerNotFound(PathBuf),
    #[error("TensorRT-LLM Runtime error: {0}")]
    Runtime(String),
    #[error("Tokenizer error: {0}")]
    Tokenizer(String),
    #[error("Argument validation error: {0}")]
    ArgumentValidation(String),
    #[error("WebServer error: {0}")]
    WebServer(#[from] server::WebServerError),
    #[error("Tokio runtime failed to start: {0}")]
    Tokio(#[from] std::io::Error),
}


================================================
FILE: backends/trtllm/src/lib.rs
================================================
pub use looper::TensorRtLlmBackendV2;

pub mod errors;
mod looper;
mod utils;

#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
mod ffi {
    #[cxx_name = "finish_reason_t"]
    #[derive(Debug, Clone, Copy)]
    pub enum FinishReason {
        /// The request is not finished.
        #[cxx_name = "kNOT_FINISHED"]
        NotFinished = 0u8,

        /// The request finished because the end id was generated.
        #[cxx_name = "kEND_ID"]
        EndTokenId = 1u8,

        /// The request finished because a stop word was generated.
        #[cxx_name = "kSTOP_WORDS"]
        StopWords = 2u8,

        /// The request finished because the maximum number of tokens was reached.
        #[cxx_name = "kLENGTH"]
        MaxLength = 3u8,
    }

    /// Struct used as shared type between rust and C++ to represent the result
    /// of a single decoding iteration
    #[cxx_name = "generation_step_t"]
    #[derive(Debug, Clone)]
    pub struct GenerationStep {
        request_id: u64,
        token_id: u32,
        log_prob: f32,
        is_final: bool,
        finish_reason: FinishReason,
        has_error: bool,
        error_msg: String,
    }

    unsafe extern "C++" {
        include!("backends/trtllm/csrc/ffi.hpp");

        /// Represent an instance of the underlying TensorRT-LLM backend
        #[cxx_name = "tensorrt_llm_backend_t"]
        type TensorRtLlmBackendImpl;

        /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend
        ///
        /// # Arguments
        ///
        /// * `engine_folder`: Path to the folder containing all the TRTLLM engines
        /// * `executor_worker`: Path to the TRTLLM executor worker
        ///
        /// returns: <unknown>
        ///
        /// # Examples
        ///
        /// ```
        ///
        /// ```
        fn create_backend_from_engine_folder(
            engine_folder: &str,
            executor_worker: &str,
        ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;

        fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;

        fn submit(
            self: Pin<&mut TensorRtLlmBackendImpl>,
            tokens: &[u32],
            max_new_tokens: u32,
            top_k: u32,
            top_p: f32,
            temperature: f32,
            repetition_penalty: f32,
            frequency_penalty: f32,
            seed: u64,
        ) -> Result<u64>;

        fn pull_tokens(
            self: Pin<&mut TensorRtLlmBackendImpl>,
        ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;

        fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64);
    }
}

use ffi::FinishReason;
use text_generation_router::FinishReason as InferFinishReason;

impl From<FinishReason> for InferFinishReason {
    fn from(reason: FinishReason) -> Self {
        match reason {
            FinishReason::StopWords => InferFinishReason::StopSequence,
            FinishReason::MaxLength => InferFinishReason::Length,
            FinishReason::EndTokenId => InferFinishReason::EndOfSequenceToken,
            _ => panic!("Cannot convert {reason:?} to text_generation_router::FinishReason"),
        }
    }
}


================================================
FILE: backends/trtllm/src/looper.rs
================================================
use async_trait::async_trait;
use cxx::UniquePtr;
use hashbrown::HashMap;
use std::hint;
use std::ops::Deref;
use std::path::Path;
use tokenizers::Tokenizer;
use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
use tokio::sync::TryAcquireError;
use tokio::task::spawn_blocking;
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{debug, error, warn};

use text_generation_router::infer::InferError::{GenerationError, ValidationError};
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
use text_generation_router::validation::ValidationError::{
    EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality,
};
use text_generation_router::validation::{Chunk, ValidGenerateRequest};
use text_generation_router::Token;

use crate::errors::TensorRtLlmBackendError;
use crate::ffi::{
    create_backend_from_engine_folder, FinishReason, GenerationStep, TensorRtLlmBackendImpl,
};
use crate::utils::first_line;

type InferResult<T> = Result<T, InferError>;

/// Wrap the requests along with the channel used to stream back to the client the decoded tokens
struct GenerationContext {
    request: ValidGenerateRequest,
    streamer: UnboundedSender<InferResult<InferStreamResponse>>,
    tokens: Vec<u32>,
    start: Option<Instant>,
    queued: Instant,
}

#[derive(Debug, Copy, Clone)]
struct DecodedToken {
    id: u32,
    log_prob: f32,
    is_final: bool,
    finish_reason: FinishReason,
}

impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
    type Error = InferError;

    fn try_from(step: &'step GenerationStep) -> Result<Self, Self::Error> {
        if !step.has_error {
            Ok(Self {
                id: step.token_id,
                log_prob: step.log_prob,
                is_final: step.is_final,
                finish_reason: step.finish_reason,
            })
        } else {
            Err(GenerationError(step.error_msg.clone()))
        }
    }
}

fn executor_status_looper(
    max_inflight_requests: usize,
    tokenizer: Tokenizer,
    mut backend: UniquePtr<TensorRtLlmBackendImpl>,
    mut backlog: UnboundedReceiver<GenerationContext>,
) {
    // Track the tuple (request_id, stream) for each request
    let mut in_flights =
        HashMap::<u64, GenerationContext>::with_capacity(max_inflight_requests * 2);

    'scheduler: loop {
        // Is there any request pending to be scheduled?
        let awaiting_requests = backlog.len();
        for _ in 0..awaiting_requests {
            // Retrieve all the requests
            if let Some(ctx) = backlog.blocking_recv() {
                // Submit all the request to the executor and move the context to the in-flight tracker
                let request = &ctx.request;
                let generation_params = &request.parameters;
                let stopping_params = &request.stopping_parameters;
                let input_ids = request.input_ids.as_deref();

                // Submit to the TensorRT-LLM executor for scheduling
                match backend.pin_mut().submit(
                    &input_ids.unwrap(), // This is checked beforehand in validate()
                    stopping_params.max_new_tokens,
                    generation_params.top_k,
                    generation_params.top_p,
                    generation_params.temperature,
                    generation_params.repetition_penalty,
                    generation_params.frequency_penalty,
                    generation_params.seed,
                ) {
                    Ok(request_id) => {
                        // Insert the context linked to the generated request id in the tracker
                        debug!("[in-flight] Added {}", request_id);
                        in_flights.insert(request_id, ctx);
                    }
                    Err(e) => {
                        // Return to the caller
                        let what = e.to_string();
                        error!(error = what.as_str(), "Failed to schedule request");

                        let err = Err(InferError::Overloaded(TryAcquireError::NoPermits));
                        if let Err(_) = ctx.streamer.send(err) {
                            error!("Failed to send back error to the client");
                        }
                    }
                };
            } else {
                break 'scheduler;
            }
        }

        if backend.num_tokens_ready() > 0 {
            let mut backend = backend.pin_mut();
            match backend.as_mut().pull_tokens() {
                Ok(responses) => {
                    // Iterate through all the decoded token
                    for step in responses.deref() {
                        if let Some(ctx) = in_flights.get_mut(&step.request_id) {
                            // Update the starting timestamp if not set
                            // This value might not be the actual real starting time of the request
                            // on the executor side - Need to expose more info from the executor to
                            // retrieve this value
                            // TODO : Expose actual real starting time for a request on FFI layer
                            if ctx.start.is_none() {
                                ctx.start = Some(Instant::now());
                            }

                            // Try to map the generation step to a DecodedToken
                            let response = match DecodedToken::try_from(step) {
                                Ok(decoded_token) => {
                                    post_process_decoded_token(&tokenizer, ctx, decoded_token)
                                }
                                Err(err) => Err(err),
                            };

                            // Attempt to send back the response to the client
                            if let Err(_) = ctx.streamer.send(response) {
                                // Client has dropped, remove from tracked requests
                                debug!(
                                    "Client dropped - removing request {} from tracked requests",
                                    step.request_id
                                );
                                backend.as_mut().cancel(step.request_id);
                                let _ = in_flights.remove(&step.request_id);
                            }
                        } else {
                            warn!("Untracked request {}", step.request_id,);
                        }
                    }
                }
                Err(ref err) => {
                    error!("Failed to get responses from the executor: {}.", err.what());
                    break 'scheduler;
                }
            }
        }

        // Hint the CPU we are spin-locking
        hint::spin_loop();
    }
}

fn post_process_decoded_token(
    tokenizer: &Tokenizer,
    ctx: &mut GenerationContext,
    decoded_token: DecodedToken,
) -> InferResult<InferStreamResponse> {
    match tokenizer.decode(&[decoded_token.id], false) {
        Ok(text) => {
            let is_special = tokenizer.get_added_vocabulary().is_special_token(&text);
            let token = Token {
                id: decoded_token.id,
                text,
                logprob: decoded_token.log_prob,
                special: is_special,
            };

            // Append the token to the tracked generated tokens
            ctx.tokens.push(token.id);

            // Map the correct response depending on the step is final or not
            let out = if !decoded_token.is_final {
                InferStreamResponse::Intermediate {
                    token,
                    top_tokens: vec![],
                }
            } else {
                let text = tokenizer.decode(&ctx.tokens, true);
                let generated_text = GeneratedText {
                    text: text.unwrap(),
                    generated_tokens: ctx.tokens.len() as u32,
                    finish_reason: decoded_token.finish_reason.into(),
                    seed: None,
                };

                InferStreamResponse::End {
                    token,
                    top_tokens: vec![],
                    generated_text,
                    start: ctx.start.unwrap(),
                    queued: ctx.queued,
                }
            };

            Ok(out)
        }
        Err(err) => Err(GenerationError(err.to_string())),
    }
}

fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(
    engine_folder: P,
    executor_worker_path: PP,
) -> Result<(String, String), TensorRtLlmBackendError> {
    // Retrieve paths as &str for the backend creation
    let engine_folder = engine_folder.as_ref();
    let executor_worker_path = executor_worker_path.as_ref();

    // Ensure the engine folder exists
    if !engine_folder.exists() {
        let err = TensorRtLlmBackendError::EngineFolderDoesntExists(engine_folder.to_path_buf());

        error!("Path validation failed: {}", err,);
        return Err(err);
    }

    // Ensure executor worker binary exists
    if !executor_worker_path.exists() {
        let err = TensorRtLlmBackendError::ExecutorWorkerNotFound(engine_folder.to_path_buf());

        error!("Path validation failed: {}", err,);
        return Err(err);
    }

    let engine_folder = String::from(
        engine_folder
            .to_str()
            .expect("Failed to convert engine_folder to valid UTF-8"),
    );

    let executor_worker_path = String::from(
        executor_worker_path
            .to_str()
            .expect("Failed to convert executor_worker_path to valid UTF-8"),
    );

    Ok((engine_folder, executor_worker_path))
}

unsafe impl Send for TensorRtLlmBackendImpl {}

pub struct TensorRtLlmBackendV2(UnboundedSender<GenerationContext>);

impl TensorRtLlmBackendV2 {
    pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(
        tokenizer: Tokenizer,
        engine_folder: P,
        executor_worker_path: PP,
        max_inflight_requests: usize,
    ) -> Result<Self, TensorRtLlmBackendError> {
        let (engine_folder, executor_worker_path) =
            ensure_paths_exist(engine_folder, executor_worker_path)?;

        // Allocate the IPC layer to communicate with the backend
        let (executor_sender, executor_receiver) = unbounded_channel();

        // Create the FFI backend
        let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;

        // Executor looper is responsible for scheduling and pulling requests state at regular interval
        spawn_blocking(move || {
            executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver)
        });

        Ok(TensorRtLlmBackendV2(executor_sender))
    }

    fn validate(request: &ValidGenerateRequest) -> InferResult<()> {
        if request.input_ids.is_none() {
            return Err(ValidationError(UnsupportedModality("No token provided")));
        }

        if request.top_n_tokens > 1 {
            return Err(ValidationError(TopNTokensDisabled));
        }

        // TODO: Is it really needed? How can it be validated before?
        if request.parameters.grammar.is_some() {
            return Err(ValidationError(Grammar));
        }

        match request.inputs.len() {
            0 => Err(ValidationError(EmptyInput)),
            2.. => Err(GenerationError(
                "TensorRT-LLM backend don't support multi-chunk".into(),
            )),
            1 => match request.inputs.first().expect("Single item-chunk") {
                Chunk::Text(_) => Ok(()),
                Chunk::Image(_) => Err(ValidationError(UnsupportedModality("image"))),
            },
        }
    }
}

#[async_trait]
impl Backend for TensorRtLlmBackendV2 {
    fn schedule(
        &self,
        request: ValidGenerateRequest,
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
        Self::validate(&request)?;

        // Open-up the stream to send tokens
        let (streamer, receiver) = unbounded_channel::<InferResult<InferStreamResponse>>();

        // Send the context to the executor for scheduling
        let queued = Instant::now();
        match self.0.send(GenerationContext {
            request,
            streamer,
            tokens: Vec::with_capacity(256),
            start: None,
            queued,
        }) {
            Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
            Err(_) => Err(GenerationError(
                "Failed to submit request to the backend".into(),
            )),
        }
    }

    async fn health(&self, _: bool) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        "TensorRT-LLM"
    }
}


================================================
FILE: backends/trtllm/src/main.rs
================================================
use std::path::{Path, PathBuf};

use clap::Parser;
use hf_hub::api::tokio::{Api, ApiBuilder};
use hf_hub::{Cache, Repo, RepoType};
use tracing::info;

use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
use text_generation_backends_trtllm::TensorRtLlmBackendV2;
use text_generation_router::server::{
    get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer,
};
use text_generation_router::usage_stats::UsageStatsLevel;
use text_generation_router::{server, Tokenizer};

/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,
    #[clap(default_value = "2", long, env)]
    max_best_of: usize,
    #[clap(default_value = "4", long, env)]
    max_stop_sequences: usize,
    #[clap(default_value = "5", long, env)]
    max_top_n_tokens: u32,
    #[clap(default_value = "1024", long, env)]
    max_input_tokens: usize,
    #[clap(default_value = "2048", long, env)]
    max_total_tokens: usize,
    #[clap(default_value = "4096", long, env)]
    max_batch_prefill_tokens: u32,
    #[clap(long, env)]
    max_batch_total_tokens: Option<u32>,
    #[clap(default_value = "0.0.0.0", long, env)]
    hostname: String,
    #[clap(default_value = "3000", long, short, env)]
    port: u16,
    #[clap(default_value = "9000", long, short, env)]
    prometheus_port: u16,
    #[clap(long, env, required = true)]
    tokenizer_name: String,
    #[clap(long, env)]
    tokenizer_config_path: Option<String>,
    #[clap(long, env)]
    revision: Option<String>,
    #[clap(long, env)]
    model_id: String,
    #[clap(default_value = "2", long, env)]
    validation_workers: usize,
    #[clap(long, env)]
    json_output: bool,
    #[clap(long, env)]
    otlp_endpoint: Option<String>,
    #[clap(default_value = "text-generation-inference.router", long, env)]
    otlp_service_name: String,
    #[clap(long, env)]
    cors_allow_origin: Option<Vec<String>>,
    #[clap(default_value = "4", long, env)]
    max_client_batch_size: usize,
    #[clap(long, env)]
    auth_token: Option<String>,
    #[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
    executor_worker: PathBuf,
    #[clap(default_value = "on", long, env)]
    usage_stats: UsageStatsLevel,
    #[clap(default_value = "2000000", long, env)]
    payload_limit: usize,
    #[clap(default_value = "1073741824", long, env)]
    max_image_fetch_size: usize,
}

async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option<Tokenizer> {
    // Parse Huggingface hub token
    let authorization_token = std::env::var("HF_TOKEN")
        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
        .ok();

    // Tokenizer instance
    let local_path = Path::new(tokenizer_name);

    // Shared API builder initialization
    let api_builder = || {
        let mut builder = ApiBuilder::new()
            .with_progress(false)
            .with_token(authorization_token);

        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
            builder = builder.with_cache_dir(cache_dir.into());
        }

        if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
            builder = builder.with_user_agent("origin", origin.as_str());
        }

        builder
    };

    // Decide if we need to use the API based on the revision and local path
    let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir();

    // Initialize API if needed
    #[derive(Clone)]
    enum Type {
        Api(Api),
        Cache(Cache),
        None,
    }
    let api = if use_api {
        if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) {
            let cache = std::env::var("HUGGINGFACE_HUB_CACHE")
                .map_err(|_| ())
                .map(|cache_dir| Cache::new(cache_dir.into()))
                .unwrap_or_else(|_| Cache::default());
            tracing::warn!("Offline mode active using cache defaults");
            Type::Cache(cache)
        } else {
            tracing::info!("Using the Hugging Face API");
            match api_builder().build() {
                Ok(api) => Type::Api(api),
                Err(_) => {
                    tracing::warn!("Unable to build the Hugging Face API");
                    Type::None
                }
            }
        }
    } else {
        Type::None
    };

    // Load tokenizer and model info
    let (
        config_filename,
        _tokenizer_config_filename,
        _preprocessor_config_filename,
        _processor_config_filename,
        _model_info,
    ) = match api {
        Type::None => (
            Some(local_path.join("config.json")),
            Some(local_path.join("tokenizer_config.json")),
            Some(local_path.join("preprocessor_config.json")),
            Some(local_path.join("processor_config.json")),
            None,
        ),
        Type::Api(api) => {
            let api_repo = api.repo(Repo::with_revision(
                tokenizer_name.to_string(),
                RepoType::Model,
                revision.unwrap_or_else(|| "main").to_string(),
            ));

            let config_filename = api_repo.get("config.json").await.ok();
            let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
            let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
            let processor_config_filename = api_repo.get("processor_config.json").await.ok();

            let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await {
                Some(model_info)
            } else {
                tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
                None
            };
            (
                config_filename,
                tokenizer_config_filename,
                preprocessor_config_filename,
                processor_config_filename,
                model_info,
            )
        }
        Type::Cache(cache) => {
            let repo = cache.repo(Repo::with_revision(
                tokenizer_name.to_string(),
                RepoType::Model,
                revision.clone().unwrap_or_else(|| "main").to_string(),
            ));
            (
                repo.get("config.json"),
                repo.get("tokenizer_config.json"),
                repo.get("preprocessor_config.json"),
                repo.get("processor_config.json"),
                None,
            )
        }
    };

    let tokenizer: Tokenizer = {
        use pyo3::prelude::*;
        pyo3::Python::with_gil(|py| -> PyResult<()> {
            py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), false)?;
            Ok(())
        })
        .inspect_err(|err| {
            tracing::error!("Failed to import python tokenizer {err}");
        })
        .or_else(|err| {
            let out = legacy_tokenizer_handle(config_filename.as_ref());
            out.ok_or(err)
        })
        .expect("We cannot load a tokenizer");
        let filename = "out/tokenizer.json";
        if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) {
            Tokenizer::Rust(tok)
        } else {
            Tokenizer::Python {
                tokenizer_name: tokenizer_name.to_string(),
                revision: revision.map(|revision| revision.to_string()),
                trust_remote_code: false,
            }
        }
    };

    Some(tokenizer)
}

#[tokio::main]
async fn main() -> Result<(), TensorRtLlmBackendError> {
    // Get args
    let args = Args::parse();
    // Pattern match configuration
    let Args {
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        hostname,
        port,
        prometheus_port,
        tokenizer_name,
        tokenizer_config_path,
        revision,
        model_id,
        validation_workers,
        json_output,
        otlp_endpoint,
        otlp_service_name,
        cors_allow_origin,
        max_client_batch_size,
        auth_token,
        executor_worker,
        usage_stats,
        payload_limit,
        max_image_fetch_size,
    } = args;

    // Launch Tokio runtime
    text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);

    // Validate args
    if max_input_tokens >= max_total_tokens {
        return Err(TensorRtLlmBackendError::ArgumentValidation(
            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
        ));
    }
    if max_input_tokens as u32 > max_batch_prefill_tokens {
        return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
    }

    if validation_workers == 0 {
        return Err(TensorRtLlmBackendError::ArgumentValidation(
            "`validation_workers` must be > 0".to_string(),
        ));
    }

    if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
        if max_batch_prefill_tokens > *max_batch_total_tokens {
            return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
        }
        if max_total_tokens as u32 > *max_batch_total_tokens {
            return Err(TensorRtLlmBackendError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
        }
    }

    if !executor_worker.exists() {
        return Err(TensorRtLlmBackendError::ArgumentValidation(format!(
            "`executor_work` specified path doesn't exists: {}",
            executor_worker.display()
        )));
    }

    // Create the backend
    match get_tokenizer(&tokenizer_name, revision.as_deref())
        .await
        .expect("Failed to retrieve tokenizer implementation")
    {
        Tokenizer::Python { .. } => Err(TensorRtLlmBackendError::Tokenizer(
            "Failed to retrieve Rust based tokenizer".to_string(),
        )),
        Tokenizer::Rust(tokenizer) => {
            info!("Successfully retrieved tokenizer {}", &tokenizer_name);
            let backend = TensorRtLlmBackendV2::new(
                tokenizer,
                model_id,
                executor_worker,
                max_concurrent_requests,
            )?;

            info!("Successfully created backend");

            // Run server
            server::run(
                backend,
                max_concurrent_requests,
                max_best_of,
                max_stop_sequences,
                max_top_n_tokens,
                max_input_tokens,
                max_total_tokens,
                validation_workers,
                auth_token,
                tokenizer_name,
                tokenizer_config_path,
                revision,
                false,
                hostname,
                port,
                cors_allow_origin,
                false,
                None,
                None,
                true,
                max_client_batch_size,
                usage_stats,
                payload_limit,
                max_image_fetch_size,
                prometheus_port,
            )
            .await?;
            Ok(())
        }
    }
}


================================================
FILE: backends/trtllm/src/utils.rs
================================================
///
/// Extract the first line of the provided string reference.
/// If there is no lines in the buffer, it returns a string
/// which content is defined by the content of `fail`
/// # Arguments
///
/// * `s`: The string buffer to extract the first-line from
/// * `fail`: A string content which is returned if no lines are
/// present in `s`
///
/// returns: String
///
/// # Examples
///
/// ```
/// let s = "My name is Morgan.\n I'm working at Hugging Face.";
/// first_line(s, "No line in string");
/// ```
#[inline]
pub(crate) fn first_line(s: &str, fail: &str) -> String {
    s.lines().next().unwrap_or(fail).to_string()
}


================================================
FILE: backends/trtllm/tests/test_backend.cpp
================================================
//
// Created by mfuntowicz on 12/3/24.
//

#include <catch2/catch_all.hpp>
#include <nlohmann/json.hpp>
#include <tensorrt_llm/executor/executor.h>

#include "backend.hpp"

using namespace huggingface::tgi::backends::trtllm;

TEST_CASE("parse generation_config.json all set", "[generation_config_t]")
{
    const json config_j = {{"temperature",  0.6},
                           {"top_p",        0.95},
                           {"eos_token_id", {1, 2, 3}}};
    const auto generation_config = generation_config_t(config_j);

    REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(0.6, 1e-6));
    REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(0.95, 1e-6));

    // Stop words
    REQUIRE_FALSE(generation_config.stop_words.empty());
    REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());

    for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1},
                                                                                                        {2},
                                                                                                        {3}})) {
        // Currently we do not support multi-tokens stop words
        REQUIRE(lhs.size() == 1);
        REQUIRE(rhs.size() == 1);
        REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs));
    }
}

TEST_CASE("parse generation_config.json default", "[generation_config_t]")
{
    const json config_j = {{"eos_token_id", {1, 2, 3}}};
    const auto generation_config = generation_config_t(config_j);

    REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
    REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));

    REQUIRE_FALSE(generation_config.stop_words.empty());
    REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());

    for (auto [lhs, rhs]: std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1},
                                                                                                        {2},
                                                                                                        {3}})) {
        // Currently we do not support multi-tokens stop words
        REQUIRE(lhs.size() == 1);
        REQUIRE(rhs.size() == 1);
        REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs));
    }
}

TEST_CASE("parse generation_config.json empty", "[generation_config_t]")
{
    const json config_j = {{"eos_token_id", {}}};
    const auto generation_config = generation_config_t(config_j);

    REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
    REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));

    REQUIRE(generation_config.stop_words.empty());

    const json config_j2 = {};
    const auto generation_config2 = generation_config_t(config_j);

    REQUIRE_THAT(generation_config2.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
    REQUIRE_THAT(generation_config2.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));

    REQUIRE(generation_config2.stop_words.empty());
}

TEST_CASE("parallel_config single", "[backend_workspace_t]")
{
    // Generate temporary folder
    const auto tmp_p = std::filesystem::temp_directory_path();
    const auto config_p = tmp_p / "config.json";
    const auto generation_config_p = tmp_p / "generation_config.json";

    // Generate content
    std::ofstream o_config(config_p);
    o_config << R"({"pretrained_config": {"mapping": {"world_size": 2}}})"_json;
    o_config.close();

    std::ofstream o_generation_config(generation_config_p);
    o_generation_config << R"({"eos_token_id": []})"_json;
    o_generation_config.close();

    const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string());
    const auto parallel = workspace.parallel_config();
    REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kORCHESTRATOR);
    REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI);

    std::filesystem::remove(config_p);
    std::filesystem::remove(generation_config_p);
}

TEST_CASE("parallel_config multi", "[backend_workspace_t]")
{
    // Generate temporary folder
    const auto tmp_p = std::filesystem::temp_directory_path();
    const auto config_p = tmp_p / "config.json";
    const auto generation_config_p = tmp_p / "generation_config.json";

    // Generate content
    std::ofstream o_config(config_p);
    o_config << R"({"pretrained_config": {"mapping": {"world_size": 1}}})"_json;
    o_config.close();

    std::ofstream o_generation_config(generation_config_p);
    o_generation_config << R"({"eos_token_id": []})"_json;
    o_generation_config.close();

    const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string());
    const auto parallel = workspace.parallel_config();
    REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kLEADER);
    REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI);

    std::filesystem::remove(config_p);
    std::filesystem::remove(generation_config_p);
}

TEST_CASE("executor_config", "[backend_workspace_t]")
{

}

TEST_CASE("sampling_params_t to tle::SamplingConfig", "[backend_t]")
{
    const sampling_params_t params = {40, 0.95, 0.9, 1.0, 0.6, 2014};
    const auto config = static_cast<tle::SamplingConfig>(params);

    REQUIRE(config.getTopK().has_value());
    REQUIRE(config.getTopK().value() == params.top_k);

    REQUIRE(config.getSeed().has_value());
    REQUIRE(config.getSeed().value() == params.seed);

    REQUIRE(config.getTopP().has_value());
    REQUIRE_THAT(*config.getTopP(), Catch::Matchers::WithinAbs(params.top_p, 1e-6f));

    REQUIRE(config.getRepetitionPenalty().has_value());
    REQUIRE_THAT(*config.getRepetitionPenalty(), Catch::Matchers::WithinAbs(params.repetition_penalty, 1e-6f));

    REQUIRE(config.getFrequencyPenalty().has_value());
    REQUIRE_THAT(*config.getFrequencyPenalty(), Catch::Matchers::WithinAbs(params.frequency_penalty, 1e-6f));

    REQUIRE(config.getTemperature().has_value());
    REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f));
}


================================================
FILE: backends/trtllm/tests/test_hardware.cpp
================================================
//
// Created by mfuntowicz on 11/16/24.
//

#include <catch2/catch_all.hpp>
#include "../csrc/hardware.hpp"

using namespace huggingface::tgi::hardware::cuda;

TEST_CASE("is_at_least_<arch>") {
    const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0);
    REQUIRE(VOLTA_CAPABILITIES.is_at_least_volta());
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_turing());
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ampere());
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_hopper());

    const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5);
    REQUIRE(TURING_CAPABILITIES.is_at_least_volta());
    REQUIRE(TURING_CAPABILITIES.is_at_least_turing());
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ampere());
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_hopper());

    const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0);
    REQUIRE(AMPERE_CAPABILITIES.is_at_least_volta());
    REQUIRE(AMPERE_CAPABILITIES.is_at_least_turing());
    REQUIRE(AMPERE_CAPABILITIES.is_at_least_ampere());
    REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_hopper());

    const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9);
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_volta());
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_turing());
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ampere());
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least_hopper());

    const static auto HOPPER_CAPABILITIES = compute_capabilities_t(9, 0);
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_volta());
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_turing());
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_ampere());
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_hopper());
}

TEST_CASE("is_at_least") {
    const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0);
    REQUIRE(VOLTA_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(TURING));
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(HOPPER));

    const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5);
    REQUIRE(TURING_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE(TURING_CAPABILITIES.is_at_least(TURING));
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(HOPPER));

    const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0);
    REQUIRE(AMPERE_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE(AMPERE_CAPABILITIES.is_at_least(TURING));
    REQUIRE(AMPERE_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(HOPPER));

    const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9);
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(TURING));
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least(HOPPER));

    const static auto HOPPER_CAPABILITIES = compute_capabilities_t (9, 0);
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(TURING));
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER));
}


================================================
FILE: backends/v2/Cargo.toml
================================================
[package]
name = "text-generation-router-v2"
description = "Text Generation Webserver"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[lib]
path = "src/lib.rs"

[[bin]]
name = "text-generation-router-v2"
path = "src/main.rs"

[dependencies]
async-trait = "0.1.74"
async-stream = "0.3.5"
axum = { version = "0.7", features = ["json"] }
axum-tracing-opentelemetry = "0.16"
text-generation-router = { path = "../../router" }
clap = { version = "4.4.5", features = ["derive", "env"] }
grpc-metadata = { path = "../grpc-metadata" }
futures = "0.3.28"
hf-hub = { workspace = true }
jsonschema = { version = "0.28.0" }
metrics = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
nohash-hasher = "0.2.0"
opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
opentelemetry-otlp = "0.13.0"
rand = "0.8.5"
reqwest = { version = "0.11.20", features = [] }
serde = "1.0.188"
serde_json = "1.0.107"
slotmap = "1.0.7"
thiserror = "1.0.48"
tokenizers = { workspace = true }
tokio = { version = "1.32.0", features = [
  "rt",
  "rt-multi-thread",
  "parking_lot",
  "signal",
  "sync",
] }
tokio-stream = "0.1.14"
tower-http = { version = "0.5.1", features = ["cors"] }
tracing = "0.1.37"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
utoipa = { version = "4.2.0", features = ["axum_extras"] }
utoipa-swagger-ui = { version = "6.0.0", features = ["axum"] }
init-tracing-opentelemetry = { version = "0.14.1", features = [
  "opentelemetry-otlp",
] }
minijinja = { workspace = true }
minijinja-contrib = { workspace = true }
futures-util = "0.3.30"
regex = "1.10.3"
once_cell = "1.19.0"
image = "0.25.1"
base64 = { workspace = true }
prost = "^0.12"
tonic = "^0.10"
tower = "^0.4"

[build-dependencies]
tonic-build = "0.10.1"
prost-build = "0.12.1"

[features]
default = ["ngrok"]
ngrok = ["text-generation-router/ngrok"]
google = ["text-generation-router/google"]
kserve = ["text-generation-router/kserve"]


================================================
FILE: backends/v2/build.rs
================================================
use std::fs;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("cargo:rerun-if-changed=../../proto/");

    fs::create_dir_all("src/client/pb").unwrap_or(());
    let mut config = prost_build::Config::new();
    config.protoc_arg("--experimental_allow_proto3_optional");

    tonic_build::configure()
        .build_client(true)
        .build_server(false)
        .out_dir("src/client/pb")
        .include_file("mod.rs")
        .compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
        .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));

    Ok(())
}


================================================
FILE: backends/v2/src/backend.rs
================================================
use crate::client::{Batch, CachedBatch, ClientError, Generation, Health, ShardedClient};
/// Batching and inference logic
use crate::queue::{Entry, Queue};
use async_trait::async_trait;
use nohash_hasher::IntMap;
use std::sync::Arc;
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
use text_generation_router::validation::ValidGenerateRequest;
use text_generation_router::{FinishReason, PrefillToken, Token};
use tokio::sync::mpsc::error::SendError;
use tokio::sync::{mpsc, Notify};
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{info_span, instrument, Instrument, Span};

pub struct BackendV2 {
    /// Request queue
    queue: Queue,
    /// Notify batcher on queue appends
    batching_task_notifier: Arc<Notify>,
    /// Client clone, used for health checks to skip the queue
    client: ShardedClient,
}

impl BackendV2 {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        client: ShardedClient,
        waiting_served_ratio: f32,
        max_batch_prefill_tokens: u32,
        max_batch_total_tokens: u32,
        max_waiting_tokens: usize,
        max_batch_size: Option<usize>,
        requires_padding: bool,
        window_size: Option<u32>,
        speculate: u32,
    ) -> Self {
        // Infer shared state
        let attention = std::env::var("ATTENTION").unwrap_or("paged".to_string());
        let block_size = match attention.as_str() {
            "flashinfer" => 1,
            "flashdecoding" => 256,
            "paged" => 16,
            _ => unreachable!(),
        };

        let queue = Queue::new(requires_padding, block_size, window_size, speculate);
        let batching_task_notifier = Arc::new(Notify::new());

        // Spawn batching background task that contains all the inference logic
        tokio::spawn(batching_task(
            client.clone(),
            waiting_served_ratio,
            max_batch_prefill_tokens,
            max_batch_total_tokens,
            max_waiting_tokens,
            max_batch_size,
            queue.clone(),
            batching_task_notifier.clone(),
        ));

        Self {
            queue,
            batching_task_notifier,
            client,
        }
    }
}

#[async_trait]
impl Backend for BackendV2 {
    #[instrument(skip_all)]
    fn schedule(
        &self,
        request: ValidGenerateRequest,
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
        // MPSC channel to communicate with the background batching task
        let (response_tx, response_rx) = mpsc::unbounded_channel();

        // Append the request to the queue
        self.queue.append(Entry {
            request,
            response_tx,
            span: Span::current(),
            temp_span: None,
            queue_time: Instant::now(),
            batch_time: None,
        });

        // Notify the background task that we have a new entry in the queue that needs
        // to be batched
        self.batching_task_notifier.notify_one();

        // Return stream
        Ok(UnboundedReceiverStream::new(response_rx))
    }

    async fn health(&self, current_health: bool) -> bool {
        if current_health {
            // Generation is healthy, we only check that the shards can allocate on device
            self.client.device_health().await
        } else {
            self.client.model_health().await
        }
        .is_ok()
    }

    fn start_health(&self) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        "tgi-v2"
    }
}

/// Batching logic
/// Will be launched in a background Tokio task
///
/// Batches requests and sends them to the inference server
#[allow(clippy::too_many_arguments)]
pub(crate) async fn batching_task(
    mut client: ShardedClient,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
    max_batch_total_tokens: u32,
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
    queue: Queue,
    notifier: Arc<Notify>,
) {
    // Infinite loop
    loop {
        // Wait for a notification from the Infer struct
        notifier.notified().await;

        // Get the next batch from the queue
        // This batch might be smaller than the maximum batch size if there are not enough requests
        // waiting in the queue
        while let Some((mut entries, batch, span)) = queue
            .next_batch(
                None,
                max_batch_size,
                max_batch_prefill_tokens,
                max_batch_total_tokens,
            )
            .await
        {
            let mut cached_batch = prefill(&mut client, batch, &mut entries)
                .instrument(span)
                .await;
            let mut waiting_tokens = 1;

            // We loop until we do not receive any cached batch from the inference server (== until
            // all requests have met their stopping criteria)
            while let Some(batch) = cached_batch {
                // Get current batch info
                let batch_size = batch.size;
                let batch_max_tokens = batch.max_tokens;
                let mut batches = vec![batch];
                metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
                metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);

                let min_size = if waiting_tokens >= max_waiting_tokens {
                    // If we didn't onboard any new requests since >= max_waiting_tokens, we try
                    // to add a new batch even though its size might be small
                    None
                } else {
                    // Minimum batch size
                    Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
                };

                let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
                let max_size =
                    max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize));
                // Try to get a new batch
                if let Some((mut new_entries, new_batch, span)) = queue
                    .next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)
                    .await
                {
                    // Tracking metrics
                    if min_size.is_some() {
                        metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
                            .increment(1);
                    } else {
                        metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
                            .increment(1);
                    }

                    entries.iter_mut().for_each(|(_, entry)| {
                        // Create a new span to add the info that this entry is waiting
                        // because a new batch is being computed
                        let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
                        // Add relationships
                        span.follows_from(&entry_waiting_span);
                        entry_waiting_span.follows_from(&span);
                        // Update entry
                        entry.temp_span = Some(entry_waiting_span);
                    });

                    // Generate one token for this new batch to have the attention past in cache
                    let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries)
                        .instrument(span)
                        .await;
                    // Reset waiting counter
                    waiting_tokens = 1;
                    // Extend current batch with the new batch
                    if let Some(new_cached_batch) = new_cached_batch {
                        entries.extend(new_entries);
                        batches.push(new_cached_batch);
                    }
                }

                // Create span for this batch to add context to inference calls
                let next_batch_size = entries.len();
                let next_batch_span =
                    info_span!(parent: None, "batch", batch_size = next_batch_size);
                entries.iter_mut().for_each(|(_, entry)| {
                    // Create a new span to link the batch back to this entry
                    let entry_batch_span = info_span!(parent: &entry.span, "infer");
                    // Add relationships
                    next_batch_span.follows_from(&entry_batch_span);
                    entry_batch_span.follows_from(&next_batch_span);
                    // Update entry
                    entry.temp_span = Some(entry_batch_span);
                });

                cached_batch = decode(&mut client, batches, &mut entries)
                    .instrument(next_batch_span)
                    .await;
                waiting_tokens += 1;
            }
            metrics::gauge!("tgi_batch_current_size").set(0.0);
            metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
        }
    }
}

#[instrument(skip_all)]
async fn prefill(
    client: &mut ShardedClient,
    batch: Batch,
    entries: &mut IntMap<u64, Entry>,
) -> Option<CachedBatch> {
    let start_time = Instant::now();
    let batch_id = batch.id;
    metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);

    match client.prefill(batch).await {
        Ok((generations, next_batch, timings)) => {
            let start_filtering_time = Instant::now();
            // Send generated tokens and filter stopped entries
            filter_send_generations(generations, entries);

            // Filter next batch and remove requests that were stopped
            let next_batch = filter_batch(client, next_batch, entries).await;

            metrics::histogram!("tgi_batch_forward_duration","method" => "prefill")
                .record(timings.forward.as_secs_f64());
            metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
                .record(timings.decode.as_secs_f64());
            metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
                .record(start_filtering_time.elapsed().as_secs_f64());
            metrics::histogram!("tgi_batch_inference_duration","method" => "prefill")
                .record(start_time.elapsed().as_secs_f64());
            metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
            next_batch
        }
        // If we have an error, we discard the whole batch
        Err(err) => {
            let _ = client.clear_cache(Some(batch_id)).await;
            send_errors(err, entries);
            metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
            None
        }
    }
}

#[instrument(skip_all)]
async fn decode(
    client: &mut ShardedClient,
    batches: Vec<CachedBatch>,
    entries: &mut IntMap<u64, Entry>,
) -> Option<CachedBatch> {
    let start_time = Instant::now();
    let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
    metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);

    match client.decode(batches).await {
        Ok((generations, next_batch, timings)) => {
            let start_filtering_time = Instant::now();
            // Send generated tokens and filter stopped entries
            filter_send_generations(generations, entries);

            // Filter next batch and remove requests that were stopped
            let next_batch = filter_batch(client, next_batch, entries).await;

            if let Some(concat_duration) = timings.concat {
                metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
                    .record(concat_duration.as_secs_f64());
            }
            metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
                .record(timings.forward.as_secs_f64());
            metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
                .record(timings.decode.as_secs_f64());
            metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
                .record(start_filtering_time.elapsed().as_secs_f64());
            metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
                .record(start_time.elapsed().as_secs_f64());
            metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
            next_batch
        }
        // If we have an error, we discard the whole batch
        Err(err) => {
            for id in batch_ids {
                let _ = client.clear_cache(Some(id)).await;
            }
            send_errors(err, entries);
            metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
            None
        }
    }
}

/// Filter a `batch` and remove all requests not present in `entries`
#[instrument(skip_all)]
async fn filter_batch(
    client: &mut ShardedClient,
    next_batch: Option<CachedBatch>,
    entries: &IntMap<u64, Entry>,
) -> Option<CachedBatch> {
    let mut batch = next_batch?;

    // No need to filter
    if batch.size as usize == entries.len() {
        return Some(batch);
    }

    let id = batch.id;

    // Retain only requests that are still in entries
    batch.request_ids.retain(|id| entries.contains_key(id));

    if batch.request_ids.is_empty() {
        // All requests have been filtered out
        // Next batch is now empty
        // Clear it from the Python shards cache
        // We unwrap here as we need to panic since we cannot recover if this method fails
        client.clear_cache(Some(id)).await.unwrap();
        None
    } else {
        // Filter Python shard cache
        // We unwrap here as we need to panic since we cannot recover if this method fails
        client.filter_batch(id, batch.request_ids).await.unwrap()
    }
}

/// Send one or multiple `InferStreamResponse` to Infer for all `entries`
/// and filter entries
#[instrument(skip_all)]
fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
    generations.into_iter().for_each(|generation| {
        let id = generation.request_id;
        // Get entry
        // We can `expect` here as the request id should always be in the entries
        let entry = entries
            .get(&id)
            .expect("ID not found in entries. This is a bug.");

        // Create and enter a span to link this function back to the entry
        let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
        // Send generation responses back to the infer task
        // If the receive an error from the Flume channel, it means that the client dropped the
        // request and we need to stop generating hence why we unwrap_or(true)
        let stopped = send_responses(generation, entry).inspect_err(|_err| {
            tracing::error!("Entry response channel error.");
            metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
        }).unwrap_or(true);
        if stopped {
            entries.remove(&id).expect("ID not found in entries. This is a bug.");
        }
    });
}

/// Send responses through the `entry` response channel
fn send_responses(
    generation: Generation,
    entry: &Entry,
) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
    // Return directly if the channel is disconnected
    if entry.response_tx.is_closed() {
        metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
        return Ok(true);
    }

    let mut stopped = false;

    if let Some(prefill_tokens) = generation.prefill_tokens {
        // Create Token objects
        // We do that here instead of in the Python code as Rust for loops are faster
        let prefill_tokens = prefill_tokens
            .ids
            .into_iter()
            .zip(prefill_tokens.logprobs)
            .zip(prefill_tokens.texts)
            .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
            .collect();

        // Send message
        entry
            .response_tx
            .send(Ok(InferStreamResponse::Prefill(prefill_tokens)))?;
    }

    // Create last Token
    let tokens_ = generation.tokens.expect("Non empty tokens in generation");
    let n = tokens_.ids.len();
    metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
    let mut iterator = tokens_
        .ids
        .into_iter()
        .zip(tokens_.logprobs)
        .zip(tokens_.texts)
        .zip(tokens_.is_special)
        .enumerate()
        .peekable();
    while let Some((i, (((id, logprob), text), special))) = iterator.next() {
        let token = Token {
            id,
            text,
            logprob,
            special,
        };
        let top_tokens = if let Some(top_tokens_) = generation.top_tokens.get(i) {
            top_tokens_
                .ids
                .iter()
                .zip(top_tokens_.logprobs.iter())
                .zip(top_tokens_.texts.iter())
                .zip(top_tokens_.is_special.iter())
                .map(|(((&id, &logprob), text), &special)| Token {
                    id,
                    text: text.to_string(),
                    logprob,
                    special,
                })
                .collect()
        } else {
            vec![]
        };
        match (&generation.generated_text, iterator.peek()) {
            (Some(generated_text), None) => {
                // Generation has ended
                stopped = true;
                // Send message
                entry.response_tx.send(Ok(InferStreamResponse::End {
                    token,
                    top_tokens,
                    generated_text: GeneratedText::from(generated_text.clone()),
                    queued: entry.queue_time,
                    start: entry.batch_time.unwrap(),
                }))?;
            }
            _ => {
                // Send message
                entry
                    .response_tx
                    .send(Ok(InferStreamResponse::Intermediate { token, top_tokens }))?;
            }
        }
    }

    Ok(stopped)
}

/// Send errors to Infer for all `entries`
#[instrument(skip_all)]
fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
    entries.drain().for_each(|(_, entry)| {
        // Create and enter a span to link this function back to the entry
        let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
        let err = InferError::GenerationError(error.to_string());
        metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
        tracing::error!("{err}");

        // unwrap_or is valid here as we don't care if the receiver is gone.
        entry
            .response_tx
            .send(Err(err))
            .unwrap_or(());
    });
}

impl From<crate::client::GeneratedText> for GeneratedText {
    fn from(value: crate::client::GeneratedText) -> Self {
        let v2_finish_reason = crate::client::FinishReason::try_from(value.finish_reason).unwrap();
        let finish_reason = match v2_finish_reason {
            crate::client::FinishReason::Length => FinishReason::Length,
            crate::client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
            crate::client::FinishReason::StopSequence => FinishReason::StopSequence,
        };

        Self {
            text: value.text,
            generated_tokens: value.generated_tokens,
            finish_reason,
            seed: value.seed,
        }
    }
}


================================================
FILE: backends/v2/src/client/grpc_client.rs
================================================
/// Single shard Client
use crate::client::pb;
use crate::client::{ClientError, Result, WARMUP_IMAGE_BASE64};
use grpc_metadata::InjectTelemetryContext;
use pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
use pb::generate::v2::*;
use std::cmp::min;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tracing::instrument;

/// Text Generation Inference gRPC client
#[derive(Debug, Clone)]
pub struct Client {
    stub: TextGenerationServiceClient<Channel>,
}

impl Client {
    /// Returns a client connected to the given url
    #[allow(dead_code)]
    pub async fn connect(uri: Uri) -> Result<Self> {
        let channel = Channel::builder(uri).connect().await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a client connected to the given unix socket
    pub async fn connect_uds(path: String) -> Result<Self> {
        let channel = Channel::from_shared("http://[::]:50051".to_string())
            .unwrap()
            .connect_with_connector(tower::service_fn(move |_: Uri| {
                tokio::net::UnixStream::connect(path.clone())
            }))
            .await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a list of uris or unix sockets of all shards
    #[instrument(skip(self))]
    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
        let response = self.stub.service_discovery(request).await.map_err(|_| {
            ClientError::Connection("Server does not support v2 interface".to_string())
        })?;
        let urls = response
            .into_inner()
            .urls
            .into_iter()
            // Remove unix socket prefix
            .map(|url| match url.strip_prefix("unix://") {
                None => url,
                Some(stripped_url) => stripped_url.to_string(),
            })
            .collect();
        Ok(urls)
    }

    /// Get model info
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<InfoResponse> {
        let request = tonic::Request::new(InfoRequest {}).inject_context();
        let response = self.stub.info(request).await?.into_inner();
        Ok(response)
    }

    /// Get model health
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let request = tonic::Request::new(HealthRequest {}).inject_context();
        let response = self.stub.health(request).await?.into_inner();
        Ok(response)
    }

    /// Clear the past generations cache
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
        self.stub.clear_cache(request).await?;
        Ok(())
    }

    /// Filter a cached batch
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let request = tonic::Request::new(FilterBatchRequest {
            batch_id,
            request_ids,
        })
        .inject_context();
        let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
        Ok(filtered_batch.batch)
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip_all)]
    pub async fn warmup(
        &mut self,
        max_input_length: u32,
        max_prefill_tokens: u32,
        max_total_tokens: u32,
        max_batch_size: Option<usize>,
    ) -> Result<Option<u32>> {
        let mut n_tokens = 0;
        let mut requests = Vec::new();
        // Create requests
        while n_tokens < max_prefill_tokens {
            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);

            let mut inputs = String::new();
            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
            if n_tokens == 0 {
                // 1 request is enough to test vision heads.
                // Sending images on other queries messes up easily with truncation.
                inputs.push_str(&format!(
                    "![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
                ));
            }

            requests.push(Request {
                id: 0,
                inputs,
                // We truncate the input on the server side to be sure that it has the correct size
                truncate,
                // Set sampling parameters to also take these ops into account in the max memory
                parameters: Some(NextTokenChooserParameters {
                    temperature: 0.9,
                    top_k: 10,
                    top_p: 0.9,
                    typical_p: 0.9,
                    do_sample: false,
                    seed: 0,
                    repetition_penalty: 1.2,
                    frequency_penalty: 0.1,
                    watermark: true,
                    grammar: String::new(),
                    grammar_type: GrammarType::None as i32,
                }),
                stopping_parameters: Some(StoppingCriteriaParameters {
                    max_new_tokens: max_total_tokens - truncate,
                    stop_sequences: vec![],
                    ignore_eos_token: true,
                }),
                prefill_logprobs: true,
                top_n_tokens: 20,
            });
            n_tokens += max_input_length;

            // Check max_batch_size
            if Some(requests.len()) == max_batch_size {
                break;
            }
        }

        let batch = Batch {
            id: 0,
            size: requests.len() as u32,
            requests,
            max_tokens: 0,
        };

        let request = tonic::Request::new(WarmupRequest {
            batch: Some(batch),
            max_input_length,
            max_prefill_tokens,
            max_total_tokens,
        })
        .inject_context();
        let response = self.stub.warmup(request).await?.into_inner();
        Ok(response.max_supported_total_tokens)
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
        let response = self.stub.prefill(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
        ))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
        let response = self.stub.decode(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            DecodeTimings::new(
                response.concat_ns,
                response.forward_ns,
                response.decode_ns,
                response.total_ns,
            ),
        ))
    }
}

pub struct PrefillTimings {
    pub forward: Duration,
    pub decode: Duration,
    pub total: Duration,
}

impl PrefillTimings {
    fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
        Self {
            forward: Duration::from_nanos(forward_ns),
            decode: Duration::from_nanos(decode_ns),
            total: Duration::from_nanos(total_ns),
        }
    }
}

pub struct DecodeTimings {
    pub concat: Option<Duration>,
    pub forward: Duration,
    pub decode: Duration,
    pub total: Duration,
}

impl DecodeTimings {
    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
        Self {
            concat: concat_ns.map(Duration::from_nanos),
            forward: Duration::from_nanos(forward_ns),
            decode: Duration::from_nanos(decode_ns),
            total: Duration::from_nanos(total_ns),
        }
    }
}


================================================
FILE: backends/v2/src/client/mod.rs
================================================
//! Text Generation gRPC client library

use async_trait::async_trait;
use thiserror::Error;
use tonic::transport;
use tonic::Status;

#[allow(clippy::derive_partial_eq_without_eq)]
mod pb;

mod grpc_client;
mod sharded_client;

pub use grpc_client::Client;
pub use pb::generate::v2::{
    Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType, HealthResponse,
    InfoResponse, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};
pub use sharded_client::ShardedClient;

#[async_trait]
pub trait Health {
    /// Check if a generate server is healthy by asking it to allocate a tensor on device
    async fn device_health(&self) -> Result<()>;

    /// Check if a generate server is healthy by doing a forward pass.
    /// EXPENSIVE
    async fn model_health(&self) -> Result<()>;
}

#[derive(Debug)]
pub struct ShardInfo {
    pub requires_padding: bool,
    pub dtype: String,
    pub device_type: String,
    pub window_size: Option<u32>,
    pub speculate: u32,
}

#[derive(Error, Debug, Clone)]
pub enum ClientError {
    #[error("Could not connect to Text Generation server: {0}")]
    Connection(String),
    #[error("Server error: {0}")]
    Generation(String),
    #[error("Sharded results are empty")]
    EmptyResults,
}

impl From<Status> for ClientError {
    fn from(err: Status) -> Self {
        let err = Self::Generation(err.message().to_string());
        tracing::error!("{err}");
        err
    }
}

impl From<transport::Error> for ClientError {
    fn from(err: transport::Error) -> Self {
        let err = Self::Connection(err.to_string());
        tracing::error!("{err}");
        err
    }
}

static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";

pub type Result<T> = std::result::Result<T, ClientError>;


================================================
FILE: backends/v2/src/client/sharded_client.rs
================================================
/// Multi shard Client
use crate::client::{ClientError, Result};
use crate::client::{Health, ShardInfo};

use crate::client::grpc_client::{DecodeTimings, PrefillTimings};
use crate::client::InfoResponse;
use crate::client::{
    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};
use async_trait::async_trait;
use futures::future::join_all;
use tonic::transport::Uri;
use tracing::instrument;

#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client
pub struct ShardedClient {
    clients: Vec<Client>,
}

impl ShardedClient {
    fn new(clients: Vec<Client>) -> Self {
        Self { clients }
    }

    /// Create a new ShardedClient from a master client. The master client will communicate with
    /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
    async fn from_master_client(mut master_client: Client) -> Result<Self> {
        // Get all uris/unix sockets from the master client
        let uris = master_client.service_discovery().await?;
        let futures = uris.into_iter().map(Client::connect_uds);
        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
        Ok(Self::new(clients?))
    }

    /// Returns a client connected to the given uri
    #[allow(dead_code)]
    pub async fn connect(uri: Uri) -> Result<Self> {
        let master_client = Client::connect(uri).await?;
        Self::from_master_client(master_client).await
    }

    /// Returns a client connected to the given unix socket
    pub async fn connect_uds(path: String) -> Result<Self> {
        let master_client = Client::connect_uds(path).await?;
        Self::from_master_client(master_client).await
    }

    /// Get the model info
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<ShardInfo> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.info())
            .collect();
        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
    }

    /// GRPC health check
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.health())
            .collect();
        join_all(futures).await.pop().unwrap()
    }

    /// Clear the past generations cache
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.clear_cache(batch_id))
            .collect();
        join_all(futures).await.into_iter().collect()
    }

    /// Filter a cached batch
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
            .collect();
        // all shards return the same message
        join_all(futures).await.pop().unwrap()
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip(self))]
    pub async fn warmup(
        &mut self,
        max_input_length: u32,
        max_prefill_tokens: u32,
        max_total_tokens: u32,
        max_batch_size: Option<usize>,
    ) -> Result<Option<u32>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| {
                Box::pin(client.warmup(
                    max_input_length,
                    max_prefill_tokens,
                    max_total_tokens,
                    max_batch_size,
                ))
            })
            .collect();
        // Take the minimum value
        let results = join_all(futures)
            .await
            .into_iter()
            .collect::<Result<Vec<Option<u32>>>>()?;
        Ok(results.into_iter().flatten().min())
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.prefill(batch.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.decode(batches.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }
}

impl From<InfoResponse> for ShardInfo {
    fn from(value: InfoResponse) -> Self {
        Self {
            requires_padding: value.requires_padding,
            dtype: value.dtype,
            device_type: value.device_type,
            window_size: value.window_size,
            speculate: value.speculate,
        }
    }
}

#[async_trait]
impl Health for ShardedClient {
    async fn device_health(&self) -> Result<()> {
        self.clone().health().await?;
        Ok(())
    }

    async fn model_health(&self) -> Result<()> {
        // Dummy batch of 1 token and 1 generated token
        let liveness_request = Request {
            id: u64::MAX,
            inputs: "liveness".to_string(),
            truncate: 10,
            prefill_logprobs: false,
            parameters: Some(NextTokenChooserParameters {
                temperature: 1.0,
                top_k: 0,
                top_p: 1.0,
                typical_p: 1.0,
                do_sample: false,
                seed: 0,
                repetition_penalty: 1.0,
                frequency_penalty: 0.0,
                watermark: false,
                grammar: String::new(),
                grammar_type: GrammarType::None as i32,
            }),
            stopping_parameters: Some(StoppingCriteriaParameters {
                max_new_tokens: 1,
                stop_sequences: vec![],
                ignore_eos_token: false,
            }),
            top_n_tokens: 0,
        };
        let batch = Batch {
            id: u64::MAX,
            requests: vec![liveness_request],
            size: 1,
            max_tokens: 2,
        };
        self.clone().prefill(batch).await?;
        Ok(())
    }
}


================================================
FILE: backends/v2/src/lib.rs
================================================
mod backend;
mod client;
mod queue;

use crate::client::{ClientError, ShardedClient};
pub(crate) use backend::BackendV2;
use serde::Serialize;
use thiserror::Error;
use utoipa::ToSchema;

#[derive(Clone, Debug, Serialize, ToSchema)]
pub struct BackendInfo {
    /// Mandatory
    #[schema(example = "cuda")]
    pub model_device_type: String,
    #[schema(example = "torch.float16")]
    pub model_dtype: String,

    /// Backend parameters
    #[schema(example = "1")]
    pub speculate: usize,
    #[schema(example = "1.2")]
    pub waiting_served_ratio: f32,
    #[schema(example = "32000")]
    pub max_batch_total_tokens: u32,
    #[schema(example = "20")]
    pub max_waiting_tokens: usize,
    #[schema(nullable = true, example = "null")]
    pub max_batch_size: Option<usize>,
}

#[allow(clippy::too_many_arguments)]
pub async fn connect_backend(
    max_input_tokens: usize,
    max_total_tokens: usize,
    master_shard_uds_path: String,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
    max_batch_total_tokens: Option<u32>,
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
) -> Result<(BackendV2, BackendInfo), V2Error> {
    // Helper function
    let check_max_batch_total_tokens = |max_supported_batch_total_tokens: Option<u32>| {
        match max_supported_batch_total_tokens {
            // Older models do not support automatic max-batch-total-tokens
            None => {
                let max_batch_total_tokens = max_batch_total_tokens
                    .unwrap_or(16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)));
                tracing::warn!("Model does not support automatic max batch total tokens");
                Ok(max_batch_total_tokens)
            }
            // Flash attention models return their max supported total tokens
            Some(max_supported_batch_total_tokens) => {
                // Warn if user added his own max-batch-total-tokens as we will ignore it
                if max_batch_total_tokens.is_some() {
                    tracing::warn!(
                        "`--max-batch-total-tokens` is deprecated for Flash \
                        Attention models."
                    );
                    tracing::warn!(
                        "Inferred max batch total tokens: {max_supported_batch_total_tokens}"
                    );
                }
                if max_total_tokens as u32 > max_supported_batch_total_tokens {
                    return Err(V2Error::NotEnoughMemory(max_total_tokens));
                }

                Ok(max_supported_batch_total_tokens)
            }
        }
    };

    let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path)
        .await
        .map_err(V2Error::Connection)?;

    // server is running on v2
    // Clear the cache; useful if the webserver rebooted
    sharded_client
        .clear_cache(None)
        .await
        .map_err(V2Error::Cache)?;
    // Get info from the shard
    let shard_info = sharded_client.info().await.map_err(V2Error::Info)?;

    // Warmup model
    tracing::info!("Warming up model");
    let max_batch_total_tokens = check_max_batch_total_tokens(
        sharded_client
            .warmup(
                max_input_tokens as u32,
                max_batch_prefill_tokens,
                max_total_tokens as u32,
                max_batch_size,
            )
            .await
            .map_err(V2Error::Warmup)?,
    )?;
    tracing::info!("Setting max batch total tokens to {max_batch_total_tokens}");

    let backend_info = BackendInfo {
        waiting_served_ratio,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        model_device_type: shard_info.device_type.clone(),
        model_dtype: shard_info.dtype.clone(),
        speculate: shard_info.speculate as usize,
    };

    let backend = BackendV2::new(
        sharded_client,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        shard_info.requires_padding,
        shard_info.window_size,
        shard_info.speculate,
    );

    tracing::info!("Using backend V3");

    Ok((backend, backend_info))
}

#[derive(Debug, Error)]
pub enum V2Error {
    #[error("Unable to clear the Python model shards cache: {0}")]
    Cache(ClientError),
    #[error("Unable to connect to the Python model shards: {0}")]
    Connection(ClientError),
    #[error("Unable to get the Python model shards info: {0}")]
    Info(ClientError),
    #[error("Unable to warmup the Python model shards: {0}")]
    Warmup(ClientError),
    #[error("Not enough memory to handle `max_total_tokens={0}`")]
    NotEnoughMemory(usize),
}


================================================
FILE: backends/v2/src/main.rs
================================================
use clap::{Parser, Subcommand};
use text_generation_router::{server, usage_stats};
use text_generation_router_v2::{connect_backend, V2Error};
use thiserror::Error;

/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    #[command(subcommand)]
    command: Option<Commands>,

    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,
    #[clap(default_value = "2", long, env)]
    max_best_of: usize,
    #[clap(default_value = "4", long, env)]
    max_stop_sequences: usize,
    #[clap(default_value = "5", long, env)]
    max_top_n_tokens: u32,
    #[clap(default_value = "1024", long, env)]
    max_input_tokens: usize,
    #[clap(default_value = "2048", long, env)]
    max_total_tokens: usize,
    #[clap(default_value = "1.2", long, env)]
    waiting_served_ratio: f32,
    #[clap(default_value = "4096", long, env)]
    max_batch_prefill_tokens: u32,
    #[clap(long, env)]
    max_batch_total_tokens: Option<u32>,
    #[clap(default_value = "20", long, env)]
    max_waiting_tokens: usize,
    #[clap(long, env)]
    max_batch_size: Option<usize>,
    #[clap(default_value = "0.0.0.0", long, env)]
    hostname: String,
    #[clap(default_value = "3000", long, short, env)]
    port: u16,
    #[clap(default_value = "9000", long, short, env)]
    prometheus_port: u16,
    #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
    master_shard_uds_path: String,
    #[clap(default_value = "bigscience/bloom", long, env)]
    tokenizer_name: String,
    #[clap(long, env)]
    tokenizer_config_path: Option<String>,
    #[clap(long, env)]
    revision: Option<String>,
    #[clap(long, env, value_enum)]
    trust_remote_code: bool,
    #[clap(default_value = "2", long, env)]
    validation_workers: usize,
    #[clap(long, env)]
    api_key: Option<String>,
    #[clap(long, env)]
    json_output: bool,
    #[clap(long, env)]
    otlp_endpoint: Option<String>,
    #[clap(default_value = "text-generation-inference.router", long, env)]
    otlp_service_name: String,
    #[clap(long, env)]
    cors_allow_origin: Option<Vec<String>>,
    #[clap(long, env)]
    ngrok: bool,
    #[clap(long, env)]
    ngrok_authtoken: Option<String>,
    #[clap(long, env)]
    ngrok_edge: Option<String>,
    #[clap(long, env, default_value_t = false)]
    disable_grammar_support: bool,
    #[clap(default_value = "4", long, env)]
    max_client_batch_size: usize,
    #[clap(default_value = "on", long, env)]
    usage_stats: usage_stats::UsageStatsLevel,
    #[clap(default_value = "2000000", long, env)]
    payload_limit: usize,
    #[clap(default_value = "1073741824", long, env)]
    max_image_fetch_size: usize,
}

#[derive(Debug, Subcommand)]
enum Commands {
    PrintSchema,
}

#[tokio::main]
async fn main() -> Result<(), RouterError> {
    // Get args
    let args = Args::parse();
    // Pattern match configuration
    let Args {
        command,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        hostname,
        port,
        prometheus_port,
        master_shard_uds_path,
        tokenizer_name,
        tokenizer_config_path,
        revision,
        trust_remote_code,
        validation_workers,
        api_key,
        json_output,
        otlp_endpoint,
        otlp_service_name,
        cors_allow_origin,
        ngrok,
        ngrok_authtoken,
        ngrok_edge,
        disable_grammar_support,
        max_client_batch_size,
        usage_stats,
        payload_limit,
        max_image_fetch_size,
    } = args;

    if let Some(Commands::PrintSchema) = command {
        use utoipa::OpenApi;
        let api_doc = text_generation_router::server::ApiDoc::openapi();
        let api_doc = serde_json::to_string_pretty(&api_doc).unwrap();
        println!("{}", api_doc);
        std::process::exit(0);
    };
    text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);

    // Validate args
    if max_input_tokens >= max_total_tokens {
        return Err(RouterError::ArgumentValidation(
            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
        ));
    }
    if max_input_tokens as u32 > max_batch_prefill_tokens {
        return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
    }

    if validation_workers == 0 {
        return Err(RouterError::ArgumentValidation(
            "`validation_workers` must be > 0".to_string(),
        ));
    }

    if let Some(ref max_batch_total_tokens) = max_batch_total_tokens {
        if max_batch_prefill_tokens > *max_batch_total_tokens {
            return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
        }
        if max_total_tokens as u32 > *max_batch_total_tokens {
            return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
        }
    }

    if let Some(max_batch_size) = max_batch_size {
        if max_batch_size == 0 {
            return Err(RouterError::ArgumentValidation(
                "`max_batch_size` must be > 0".to_string(),
            ));
        }
    }

    let (backend, _backend_info) = connect_backend(
        max_input_tokens,
        max_total_tokens,
        master_shard_uds_path,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
    )
    .await?;

    // Run server
    server::run(
        backend,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        validation_workers,
        api_key,
        tokenizer_name,
        tokenizer_config_path,
        revision,
        trust_remote_code,
        hostname,
        port,
        cors_allow_origin,
        ngrok,
        ngrok_authtoken,
        ngrok_edge,
        disable_grammar_support,
        max_client_batch_size,
        usage_stats,
        payload_limit,
        max_image_fetch_size,
        prometheus_port,
    )
    .await?;
    Ok(())
}

#[derive(Debug, Error)]
enum RouterError {
    #[error("Argument validation error: {0}")]
    ArgumentValidation(String),
    #[error("Backend failed: {0}")]
    Backend(#[from] V2Error),
    #[error("WebServer error: {0}")]
    WebServer(#[from] server::WebServerError),
    #[error("Tokio runtime failed to start: {0}")]
    Tokio(#[from] std::io::Error),
}


================================================
FILE: backends/v2/src/queue.rs
================================================
use crate::client::{
    Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};
use nohash_hasher::{BuildNoHashHasher, IntMap};
use std::cmp::min;
use std::collections::VecDeque;
use text_generation_router::infer::InferError;
use text_generation_router::infer::InferStreamResponse;
use text_generation_router::validation::{
    ChunksToString, ValidGenerateRequest, ValidGrammar, ValidParameters, ValidStoppingParameters,
};
use tokio::sync::{mpsc, oneshot};
use tokio::time::Instant;
use tracing::{info_span, instrument, Span};

/// Queue entry
#[derive(Debug)]
pub(crate) struct Entry {
    /// Request
    pub request: ValidGenerateRequest,
    /// Response sender to communicate between the Infer struct and the batching_task
    pub response_tx: mpsc::UnboundedSender<Result<InferStreamResponse, InferError>>,
    /// Span that will live as long as entry
    pub span: Span,
    /// Temporary span used as a guard when logging inference, wait times...
    pub temp_span: Option<Span>,
    /// Instant when this entry was queued
    pub queue_time: Instant,
    /// Instant when this entry was added to a batch
    pub batch_time: Option<Instant>,
}

/// Request Queue
#[derive(Debug, Clone)]
pub(crate) struct Queue {
    /// Channel to communicate with the background queue task
    queue_sender: mpsc::UnboundedSender<QueueCommand>,
}

impl Queue {
    pub(crate) fn new(
        requires_padding: bool,
        block_size: u32,
        window_size: Option<u32>,
        speculate: u32,
    ) -> Self {
        // Create channel
        let (queue_sender, queue_receiver) = mpsc::unbounded_channel();

        // Launch background queue task
        tokio::spawn(queue_task(
            requires_padding,
            block_size,
            window_size,
            speculate,
            queue_receiver,
        ));

        Self { queue_sender }
    }

    #[instrument(skip_all)]
    pub(crate) fn append(&self, entry: Entry) {
        // Send append command to the background task managing the state
        // Unwrap is safe here
        self.queue_sender
            .send(QueueCommand::Append(Box::new(entry), Span::current()))
            .unwrap();
    }

    // Get the next batch
    #[instrument(skip(self))]
    pub(crate) async fn next_batch(
        &self,
        min_size: Option<usize>,
        max_size: Option<usize>,
        prefill_token_budget: u32,
        token_budget: u32,
    ) -> Option<NextBatch> {
        // Create response channel
        let (response_sender, response_receiver) = oneshot::channel();
        // Send next batch command to the background task managing the state
        // Unwrap is safe here
        self.queue_sender
            .send(QueueCommand::NextBatch {
                min_size,
                max_size,
                prefill_token_budget,
                token_budget,
                response_sender,
                span: Span::current(),
            })
            .unwrap();
        // Await on response channel
        // Unwrap is safe here
        response_receiver.await.unwrap()
    }
}

// Background task responsible of the queue state
async fn queue_task(
    requires_padding: bool,
    block_size: u32,
    window_size: Option<u32>,
    speculate: u32,
    mut receiver: mpsc::UnboundedReceiver<QueueCommand>,
) {
    let mut state = State::new(requires_padding, block_size, window_size, speculate);

    while let Some(cmd) = receiver.recv().await {
        match cmd {
            QueueCommand::Append(entry, span) => {
                span.in_scope(|| state.append(*entry));
                metrics::gauge!("tgi_queue_size").increment(1.0);
            }
            QueueCommand::NextBatch {
                min_size,
                max_size,
                prefill_token_budget,
                token_budget,
                response_sender,
                span,
            } => span.in_scope(|| {
                let next_batch =
                    state.next_batch(min_size, max_size, prefill_token_budget, token_budget);
                response_sender.send(next_batch).unwrap();
                metrics::gauge!("tgi_queue_size").set(state.entries.len() as f64);
            }),
        }
    }
}

/// Queue State
#[derive(Debug)]
struct State {
    /// Queue entries organized in a Vec
    entries: VecDeque<(u64, Entry)>,

    /// Id of the next entry
    next_id: u64,

    /// Id of the next batch
    next_batch_id: u64,

    /// Whether the model is using padding
    requires_padding: bool,

    /// Paged Attention block size
    block_size: u32,

    /// Sliding window
    window_size: Option<u32>,

    /// Speculation amount
    speculate: u32,
}

impl State {
    fn new(
        requires_padding: bool,
        block_size: u32,
        window_size: Option<u32>,
        speculate: u32,
    ) -> Self {
        Self {
            entries: VecDeque::with_capacity(128),
            next_id: 0,
            next_batch_id: 0,
            requires_padding,
            block_size,
            window_size,
            speculate,
        }
    }

    /// Append an entry to the queue
    fn append(&mut self, mut entry: Entry) {
        // Create a span that will live as long as the entry is in the queue waiting to be batched
        let queue_span = info_span!(parent: &entry.span, "queued");
        entry.temp_span = Some(queue_span);

        // Push entry in the queue
        self.entries.push_back((self.next_id, entry));
        self.next_id += 1;
    }

    // Get the next batch
    fn next_batch(
        &mut self,
        min_size: Option<usize>,
        max_size: Option<usize>,
        prefill_token_budget: u32,
        token_budget: u32,
    ) -> Option<NextBatch> {
        if self.entries.is_empty() {
            tracing::debug!("No queue");
            return None;
        }

        // Check if we have enough entries
        if let Some(min_size) = min_size {
            if self.entries.len() < min_size {
                tracing::debug!("Not enough entries");
                return None;
            }
        }

        if let Some(max_size) = max_size {
            if max_size == 0 {
                tracing::debug!("No capacity");
                return None;
            }
        }

        // Pad prefill_token_budget to be a multiple of block size
        let prefill_token_budget = prefill_token_budget.div_ceil(self.block_size) * self.block_size;

        // Create span for this batch to add context to inference calls
        let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty);
        next_batch_span.follows_from(Span::current());

        let mut batch_requests = Vec::with_capacity(self.entries.len());
        let mut batch_entries =
            IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default());

        let mut max_input_length = 0;
        let mut prefill_tokens: u32 = 0;
        let mut decode_tokens: u32 = 0;

        // Pop entries starting from the front of the queue
        while let Some((id, mut entry)) = self.entries.pop_front() {
            // Filter entries where the response receiver was dropped (== entries where the request
            // was dropped by the client)
            if entry.response_tx.is_closed() {
                metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
                tracing::debug!("Dropping entry");
                continue;
            }

            if self.requires_padding {
                // We pad to max input length in the Python shards
                // We need to take these padding tokens into the equation
                max_input_length = max_input_length.max(entry.request.input_length);
                prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length
            } else {
                // pad to block size
                prefill_tokens +=
                    entry.request.input_length.div_ceil(self.block_size) * self.block_size;
            }

            if self.requires_padding {
                decode_tokens += entry.request.stopping_parameters.max_new_tokens;
            } else {
                let max_new_tokens = match self.window_size {
                    None => entry.request.stopping_parameters.max_new_tokens,
                    Some(window_size) => min(
                        window_size.saturating_sub(entry.request.input_length),
                        entry.request.stopping_parameters.max_new_tokens,
                    ),
                };

                // pad to block size
                decode_tokens += max_new_tokens.div_ceil(self.block_size) * self.block_size;
            }

            if prefill_tokens > prefill_token_budget
                || (prefill_tokens + decode_tokens + self.speculate) > token_budget
            {
                // Entry is over budget
                // Add it back to the front
                tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
                self.entries.push_front((id, entry));
                break;
            }

            tracing::debug!("Accepting entry");
            // Create a new span to link the batch back to this entry
            let entry_batch_span = info_span!(parent: &entry.span, "infer");
            // Add relationships
            next_batch_span.follows_from(&entry_batch_span);
            entry_batch_span.follows_from(&next_batch_span);
            // Update entry
            entry.temp_span = Some(entry_batch_span);

            batch_requests.push(Request {
                id,
                prefill_logprobs: entry.request.decoder_input_details,
                inputs: entry.request.inputs.chunks_to_string(),
                truncate: entry.request.truncate,
                parameters: Some(NextTokenChooserParameters::from(
                    entry.request.parameters.clone(),
                )),
                stopping_parameters: Some(StoppingCriteriaParameters::from(
                    entry.request.stopping_parameters.clone(),
                )),
                top_n_tokens: entry.request.top_n_tokens,
            });
            // Set batch_time
            entry.batch_time = Some(Instant::now());
            // Insert in batch_entries IntMap
            batch_entries.insert(id, entry);

            // Check if max_size
            if Some(batch_requests.len()) == max_size {
                break;
            }
        }

        // Empty batch
        if batch_requests.is_empty() {
            tracing::debug!("Filtered out all entries");
            return None;
        }

        // Check if our batch is big enough
        if let Some(min_size) = min_size {
            // Batch is too small
            if batch_requests.len() < min_size {
                // Add back entries to the queue in the correct order
                for r in batch_requests.into_iter().rev() {
                    let id = r.id;
                    let entry = batch_entries.remove(&id).unwrap();
                    self.entries.push_front((id, entry));
                }

                return None;
            }
        }

        // Final batch size
        let size = batch_requests.len() as u32;
        next_batch_span.record("batch_size", size);

        let batch = Batch {
            id: self.next_batch_id,
            requests: batch_requests,
            size,
            max_tokens: (prefill_tokens + decode_tokens),
        };
        // Increment batch id
        self.next_batch_id += 1;

        metrics::histogram!("tgi_batch_next_size").record(batch.size as f64);

        Some((batch_entries, batch, next_batch_span))
    }
}

type NextBatch = (IntMap<u64, Entry>, Batch, Span);

#[derive(Debug)]
enum QueueCommand {
    Append(Box<Entry>, Span),
    NextBatch {
        min_size: Option<usize>,
        max_size: Option<usize>,
        prefill_token_budget: u32,
        token_budget: u32,
        response_sender: oneshot::Sender<Option<NextBatch>>,
        span: Span,
    },
}

impl From<ValidParameters> for NextTokenChooserParameters {
    fn from(value: ValidParameters) -> Self {
        let (grammar, grammar_type) = match value.grammar {
            None => (String::new(), GrammarType::None),

            Some(grammar) => match grammar {
                ValidGrammar::Json(grammar_string) => (grammar_string, GrammarType::Json),
                ValidGrammar::Regex(grammar_string) => (grammar_string, GrammarType::Regex),
            },
        };

        Self {
            temperature: value.temperature,
            top_k: value.top_k,
            top_p: value.top_p,
            typical_p: value.typical_p,
            do_sample: value.do_sample,
            seed: value.seed,
            repetition_penalty: value.repetition_penalty,
            frequency_penalty: value.frequency_penalty,
            watermark: value.watermark,
            grammar,
            grammar_type: grammar_type.into(),
        }
    }
}

impl From<ValidStoppingParameters> for StoppingCriteriaParameters {
    fn from(value: ValidStoppingParameters) -> Self {
        Self {
            max_new_tokens: value.max_new_tokens,
            stop_sequences: value.stop_sequences,
            ignore_eos_token: value.ignore_eos_token,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use tracing::info_span;

    fn default_entry() -> (
        Entry,
        mpsc::UnboundedReceiver<Result<InferStreamResponse, InferError>>,
    ) {
        let (response_tx, receiver_tx) = mpsc::unbounded_channel();

        let entry = Entry {
            request: ValidGenerateRequest {
                inputs: vec![],
                input_ids: Some(Arc::new(vec![])),
                input_length: 0,
                add_special_tokens: true,
                truncate: 0,
                decoder_input_details: false,
                parameters: ValidParameters {
                    temperature: 0.0,
                    top_k: 0,
                    top_p: 0.0,
                    typical_p: 0.0,
                    do_sample: false,
                    seed: 0,
                    repetition_penalty: 0.0,
                    frequency_penalty: 0.0,
                    watermark: false,
                    grammar: None,
                },
                stopping_parameters: ValidStoppingParameters {
                    ignore_eos_token: false,
                    max_new_tokens: 1,
                    max_total_new_tokens: 1024,
                    stop_sequences: vec![],
                },
                top_n_tokens: 0,
                adapter_id: None,
            },
            response_tx,
            span: info_span!("entry"),
            temp_span: None,
            queue_time: Instant::now(),
            batch_time: None,
        };
        (entry, receiver_tx)
    }

    #[test]
    fn test_append() {
        let mut state = State::new(false, 1, None, 0);
        let (entry, _guard) = default_entry();

        assert_eq!(state.next_id, 0);
        assert_eq!(state.entries.len(), 0);

        state.append(entry);

        assert_eq!(state.next_id, 1);
        assert_eq!(state.entries.len(), 1);
        let (id, _) = state.entries.remove(0).unwrap();
        assert_eq!(id, 0);
    }

    #[test]
    fn test_next_batch_empty() {
        let mut state = State::new(false, 1, None, 0);

        assert!(state.next_batch(None, None, 1, 1).is_none());
        assert!(state.next_batch(Some(1), None, 1, 1).is_none());
    }

    #[test]
    fn test_next_batch_min_size() {
        let mut state = State::new(false, 1, None, 0);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        state.append(entry1);
        state.append(entry2);

        let (entries, batch, _) = state.next_batch(None, None, 2, 2).unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&0));
        assert!(entries.contains_key(&1));
        assert!(entries.get(&0).unwrap().batch_time.is_some());
        assert!(entries.get(&1).unwrap().batch_time.is_some());
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);

        assert_eq!(state.next_id, 2);
        assert_eq!(state.entries.len(), 0);
        assert_eq!(state.next_batch_id, 1);

        let (entry3, _guard3) = default_entry();
        state.append(entry3);

        assert!(state.next_batch(Some(2), None, 2, 2).is_none());

        assert_eq!(state.next_id, 3);
        assert_eq!(state.entries.len(), 1);
        let (id, _) = state.entries.remove(0).unwrap();
        assert_eq!(id, 2);
    }

    #[test]
    fn test_next_batch_max_size() {
        let mut state = State::new(false, 1, None, 0);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        state.append(entry1);
        state.append(entry2);

        let (entries, batch, _) = state.next_batch(None, Some(1), 2, 2).unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert!(entries.get(&0).unwrap().batch_time.is_some());
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        assert_eq!(state.next_id, 2);
        assert_eq!(state.entries.len(), 1);
        assert_eq!(state.next_batch_id, 1);
    }

    #[test]
    fn test_next_batch_token_budget() {
        let mut state = State::new(false, 1, None, 0);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        state.append(entry1);
        state.append(entry2);

        let (entries, batch, _) = state.next_batch(None, None, 1, 1).unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        assert_eq!(state.next_id, 2);
        assert_eq!(state.entries.len(), 1);
        assert_eq!(state.next_batch_id, 1);

        let (entry3, _guard3) = default_entry();
        state.append(entry3);

        let (entries, batch, _) = state.next_batch(None, None, 3, 3).unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&1));
        assert!(entries.contains_key(&2));
        assert_eq!(batch.id, 1);
        assert_eq!(batch.size, 2);

        assert_eq!(state.next_id, 3);
        assert_eq!(state.entries.len(), 0);
        assert_eq!(state.next_batch_id, 2);
    }

    #[tokio::test]
    async fn test_queue_append() {
        let queue = Queue::new(false, 1, None, 0);
        let (entry, _guard) = default_entry();
        queue.append(entry);
    }

    #[tokio::test]
    async fn test_queue_next_batch_empty() {
        let queue = Queue::new(false, 1, None, 0);

        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
        assert!(queue.next_batch(Some(1), None, 1, 1).await.is_none());
    }

    #[tokio::test]
    async fn test_queue_next_batch_min_size() {
        let queue = Queue::new(false, 1, None, 0);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
        queue.append(entry2);

        let (entries, batch, _) = queue.next_batch(None, None, 2, 2).await.unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&0));
        assert!(entries.contains_key(&1));
        assert!(entries.get(&0).unwrap().batch_time.is_some());
        assert!(entries.get(&1).unwrap().batch_time.is_some());
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);

        let (entry3, _guard3) = default_entry();
        queue.append(entry3);

        // Not enough requests pending
        assert!(queue.next_batch(Some(2), None, 2, 2).await.is_none());
        // Not enough token budget
        assert!(queue.next_batch(Some(1), None, 0, 0).await.is_none());
        // Ok
        let (entries2, batch2, _) = queue.next_batch(Some(1), None, 2, 2).await.unwrap();
        assert_eq!(entries2.len(), 1);
        assert!(entries2.contains_key(&2));
        assert!(entries2.get(&2).unwrap().batch_time.is_some());
        assert_eq!(batch2.id, 1);
        assert_eq!(batch2.size, 1);
    }

    #[tokio::test]
    async fn test_queue_next_batch_max_size() {
        let queue = Queue::new(false, 1, None, 0);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
        queue.append(entry2);

        let (entries, batch, _) = queue.next_batch(None, Some(1), 2, 2).await.unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert!(entries.get(&0).unwrap().batch_time.is_some());
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);
    }

    #[tokio::test]
    async fn test_queue_next_batch_token_budget() {
        let queue = Queue::new(false, 1, None, 0);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
        queue.append(entry2);

        let (entries, batch, _) = queue.next_batch(None, None, 1, 1).await.unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        let (entry3, _guard3) = default_entry();
        queue.append(entry3);

        let (entries, batch, _) = queue.next_batch(None, None, 3, 3).await.unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&1));
        assert!(entries.contains_key(&2));
        assert_eq!(batch.id, 1);
        assert_eq!(batch.size, 2);
    }

    #[tokio::test]
    async fn test_queue_next_batch_token_speculate() {
        let queue = Queue::new(false, 1, None, 2);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
        queue.append(entry2);

        // Budget of 1 is not enough
        assert!(queue.next_batch(None, None, 1, 1).await.is_none());

        let (entries, batch, _) = queue.next_batch(None, None, 6, 6).await.unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&0));
        assert!(entries.contains_key(&1));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);
    }

    #[tokio::test]
    async fn test_queue_next_batch_dropped_receiver() {
        let queue = Queue::new(false, 1, None, 0);
        let (entry, _) = default_entry();
        queue.append(entry);

        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
    }
}


================================================
FILE: backends/v3/Cargo.toml
================================================
[package]
name = "text-generation-router-v3"
description = "Text Generation Webserver"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[lib]
path = "src/lib.rs"

[[bin]]
name = "text-generation-router"
path = "src/main.rs"

[dependencies]
async-trait = "0.1.74"
async-stream = "0.3.5"
axum = { version = "0.7", features = ["json"] }
axum-tracing-opentelemetry = "0.16"
text-generation-router = { path = "../../router" }
clap = { version = "4.4.5", features = ["derive", "env"] }
grpc-metadata = { path = "../grpc-metadata" }
futures = "0.3.28"
hf-hub = { workspace = true }
jsonschema = { version = "0.28.0" }
metrics = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
nohash-hasher = "0.2.0"
opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
opentelemetry-otlp = "0.13.0"
rand = "0.8.5"
reqwest = { version = "0.11.20", features = [] }
serde = "1.0.188"
serde_json = "1.0.107"
slotmap = "1.0.7"
thiserror = "1.0.48"
tokenizers = { workspace = true }
tokio = { version = "1.32.0", features = [
  "rt",
  "rt-multi-thread",
  "parking_lot",
  "signal",
  "sync",
] }
tokio-stream = "0.1.14"
tower-http = { version = "0.5.1", features = ["cors"] }
tracing = "0.1.37"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
utoipa = { version = "4.2.0", features = ["axum_extras"] }
utoipa-swagger-ui = { version = "6.0.0", features = ["axum"] }
init-tracing-opentelemetry = { version = "0.14.1", features = [
  "opentelemetry-otlp",
] }
minijinja = { workspace = true }
minijinja-contrib = { workspace = true }
futures-util = "0.3.30"
regex = "1.10.3"
once_cell = "1.19.0"
image = "0.25.1"
base64 = { workspace = true }
prost = "^0.12"
tonic = "^0.10"
tower = "^0.4"

[build-dependencies]
tonic-build = "0.10.1"
prost-build = "0.12.1"

[dev-dependencies]
criterion = "0.3"
itertools = "0.13"
rustc-hash = "2"

[features]
default = ["ngrok"]
ngrok = ["text-generation-router/ngrok"]
google = ["text-generation-router/google"]
kserve = ["text-generation-router/kserve"]

[[bench]]
name = "prefix_cache"
harness = false


================================================
FILE: backends/v3/benches/prefix_cache.rs
================================================
use std::sync::Arc;

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use rand::Rng;

use text_generation_router_v3::block_allocator::Allocator;
use text_generation_router_v3::radix::RadixAllocator;

fn prefix_cache_benchmark(c: &mut Criterion) {
    // let prefixes: Vec<Vec<u32>> = (0..8192)
    //     .chunks(256)
    //     .into_iter()
    //     .map(|c| c.collect())
    //     .collect();

    let mut cache = RadixAllocator::new(1, 262144, None);

    c.bench_function("Radix allocator", |b| {
        b.iter_batched(
            || {
                //prefixes
                //    .choose_multiple(&mut rand::thread_rng(), 5)
                //    .fold(Vec::new(), |mut v, s| {
                //        v.extend(s);
                //        v
                //    })

                (0..7936)
                    .map(|_| rand::thread_rng().gen_range(0..1024))
                    .collect::<Vec<u32>>()
            },
            |prefill| {
                let alloc = cache.allocate(
                    prefill.len() as u32 + 13,
                    Some(Arc::new(black_box(prefill))),
                );
                if let Some(alloc) = alloc {
                    cache.free(alloc.blocks.clone(), alloc.allocation_id);
                }
            },
            criterion::BatchSize::SmallInput,
        );
    });
}

criterion_group!(benches, prefix_cache_benchmark);
criterion_main!(benches);


================================================
FILE: backends/v3/build.rs
================================================
use std::fs;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("cargo:rerun-if-changed=../../proto/");

    fs::create_dir_all("src/client/pb").unwrap_or(());
    let mut config = prost_build::Config::new();
    config.protoc_arg("--experimental_allow_proto3_optional");

    tonic_build::configure()
        .build_client(true)
        .build_server(false)
        .out_dir("src/client/pb")
        .include_file("mod.rs")
        .compile_with_config(config, &["../../proto/v3/generate.proto"], &["../../proto"])
        .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));

    Ok(())
}


================================================
FILE: backends/v3/src/backend.rs
================================================
/// Batching and inference logic
use crate::client::{
    Batch, CachedBatch, ClientError, Generation, Health, InfoResponse, ShardedClient,
};
use crate::queue::{Entry, Queue};
use async_trait::async_trait;
use nohash_hasher::IntMap;
use std::sync::Arc;
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
use text_generation_router::validation::ValidGenerateRequest;
use text_generation_router::{FinishReason, PrefillToken, Token};
use tokio::sync::mpsc::error::SendError;
use tokio::sync::{mpsc, Notify};
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{info_span, instrument, Instrument, Span};

pub struct BackendV3 {
    /// Request queue
    queue: Queue,
    /// Notify batcher on queue appends
    batching_task_notifier: Arc<Notify>,
    /// Client clone, used for health checks to skip the queue
    client: ShardedClient,
}

impl BackendV3 {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        client: ShardedClient,
        waiting_served_ratio: f32,
        max_batch_prefill_tokens: u32,
        max_batch_total_tokens: u32,
        max_waiting_tokens: usize,
        max_batch_size: Option<usize>,
        shard_info: InfoResponse,
    ) -> Self {
        if shard_info.support_chunking {
            tracing::warn!("Model supports prefill chunking. `waiting_served_ratio` and `max_waiting_tokens` will be ignored.");
        }

        let block_size = shard_info.block_size;

        let queue = Queue::new(
            shard_info.requires_padding,
            block_size,
            shard_info.use_prefix_caching,
            shard_info.window_size,
            shard_info.speculate,
            max_batch_total_tokens,
            shard_info.support_chunking,
        );
        let batching_task_notifier = Arc::new(Notify::new());

        // Spawn batching background task that contains all the inference logic
        tokio::spawn(batching_task(
            client.clone(),
            waiting_served_ratio,
            max_batch_prefill_tokens,
            max_batch_total_tokens,
            max_waiting_tokens,
            max_batch_size,
            shard_info.support_chunking,
            queue.clone(),
            batching_task_notifier.clone(),
        ));

        Self {
            queue,
            batching_task_notifier,
            client,
        }
    }
}

#[async_trait]
impl Backend for BackendV3 {
    #[instrument(skip_all)]
    fn schedule(
        &self,
        request: ValidGenerateRequest,
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
        // MPSC channel to communicate with the background batching task
        let (response_tx, response_rx) = mpsc::unbounded_channel();

        // Append the request to the queue
        self.queue.append(Entry {
            request,
            response_tx,
            span: Span::current(),
            temp_span: None,
            queue_time: Instant::now(),
            batch_time: None,
            block_allocation: None,
        });

        // Notify the background task that we have a new entry in the queue that needs
        // to be batched
        self.batching_task_notifier.notify_one();

        // Return stream
        Ok(UnboundedReceiverStream::new(response_rx))
    }

    async fn health(&self, current_health: bool) -> bool {
        if current_health {
            // Generation is healthy, we only check that the shards can allocate on device
            self.client.device_health().await
        } else {
            self.client.model_health().await
        }
        .is_ok()
    }

    fn start_health(&self) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        "tgi-v3"
    }
}

/// Batching logic
/// Will be launched in a background Tokio task
///
/// Batches requests and sends them to the inference server
#[allow(clippy::too_many_arguments)]
pub(crate) async fn batching_task(
    mut client: ShardedClient,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
    max_batch_total_tokens: u32,
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
    support_chunking: bool,
    queue: Queue,
    notifier: Arc<Notify>,
) {
    // Infinite loop
    loop {
        // Wait for a notification from the Infer struct
        notifier.notified().await;

        // Get the next batch from the queue
        // This batch might be smaller than the maximum batch size if there are not enough requests
        // waiting in the queue
        while let Some((mut entries, batch, span)) = queue
            .next_batch(
                None,
                max_batch_size,
                max_batch_prefill_tokens,
                max_batch_total_tokens,
            )
            .await
        {
            let mut cached_batch = prefill(&mut client, batch, None, &mut entries)
                .instrument(span)
                .await;
            let mut waiting_tokens = 1;

            // We loop until we do not receive any cached batch from the inference server (== until
            // all requests have met their stopping criteria)
            while let Some(batch) = cached_batch {
                // Get current batch info
                let batch_size = batch.size;
                let batch_max_tokens = batch.max_tokens;
                let current_tokens = batch.current_tokens;
                let mut batches = vec![batch];
                metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
                metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);

                let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);

                let (min_size, max_size, prefill_token_budget) = if support_chunking {
                    // Since the next batch will be concatenated with the current batch,
                    // the current batch tokens must be subtracted to the prefill budget
                    let prefill_token_budget =
                        max_batch_prefill_tokens.saturating_sub(current_tokens);
                    // We can ignore min_size and max_size
                    // Models than rely on max_size cannot support chunking
                    // Regarding min_size, chunking allow us to consistently run at the compute
                    // bound, making min_size useless.
                    (None, None, prefill_token_budget)
                } else {
                    let min_size = if waiting_tokens >= max_waiting_tokens {
                        // If we didn't onboard any new requests since >= max_waiting_tokens, we try
                        // to add a new batch even though its size might be small
                        None
                    } else {
                        // Minimum batch size
                        // TODO: temporarily disable to avoid incorrect deallocation +
                        //       reallocation when using prefix caching.
                        Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
                    };

                    let max_size =
                        max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize));

                    (min_size, max_size, max_batch_prefill_tokens)
                };

                // Try to get a new batch
                if let Some((mut new_entries, new_batch, span)) = queue
                    .next_batch(min_size, max_size, prefill_token_budget, token_budget)
                    .await
                {
                    // Tracking metrics
                    if min_size.is_some() {
                        metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
                            .increment(1);
                    } else {
                        let counter = if support_chunking {
                            metrics::counter!("tgi_batch_concat", "reason" => "chunking")
                        } else {
                            metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
                        };
                        counter.increment(1);
                    }

                    let new_cached_batch = if support_chunking {
                        // Get cached batch
                        let cached_batch = batches.pop();
                        // Extend entries with the new entries since the batch will be
                        // concatenated during the prefill op server side
                        entries.extend(new_entries);
                        // Generate one token for both the cached batch and the new batch
                        let new_cached_batch =
                            prefill(&mut client, new_batch, cached_batch, &mut entries)
                                .instrument(span)
                                .await;
                        if new_cached_batch.is_none() {
                            // New cached batch is empty, no work left
                            break;
                        }
                        new_cached_batch
                    } else {
                        // Request are waiting because we cannot concatenate the batches if the
                        // model/server does not support chunking
                        entries.iter_mut().for_each(|(_, entry)| {
                            // Create a new span to add the info that this entry is waiting
                            // because a new batch is being computed
                            let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
                            // Add relationships
                            span.follows_from(&entry_waiting_span);
                            entry_waiting_span.follows_from(&span);
                            // Update entry
                            entry.temp_span = Some(entry_waiting_span);
                        });

                        // Generate one token for this new batch to have the attention past in cache
                        let new_cached_batch =
                            prefill(&mut client, new_batch, None, &mut new_entries)
                                .instrument(span)
                                .await;
                        if new_cached_batch.is_some() {
                            // Extend entries
                            entries.extend(new_entries);
                        }
                        new_cached_batch
                    };

                    // Reset waiting counter
                    waiting_tokens = 1;
                    // Extend current batch with the new batch
                    if let Some(new_cached_batch) = new_cached_batch {
                        batches.push(new_cached_batch);
                    }
                }

                // Create span for this batch to add context to inference calls
                let next_batch_size = entries.len();
                let next_batch_span =
                    info_span!(parent: None, "batch", batch_size = next_batch_size);
                entries.iter_mut().for_each(|(_, entry)| {
                    // Create a new span to link the batch back to this entry
                    let entry_batch_span = info_span!(parent: &entry.span, "infer");
                    // Add relationships
                    next_batch_span.follows_from(&entry_batch_span);
                    entry_batch_span.follows_from(&next_batch_span);
                    // Update entry
                    entry.temp_span = Some(entry_batch_span);
                });

                cached_batch = decode(&mut client, batches, &mut entries)
                    .instrument(next_batch_span)
                    .await;
                waiting_tokens += 1;
            }
            metrics::gauge!("tgi_batch_current_size").set(0.0);
            metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
        }
    }
}

#[instrument(skip_all)]
async fn prefill(
    client: &mut ShardedClient,
    batch: Batch,
    cached_batch: Option<CachedBatch>,
    entries: &mut IntMap<u64, Entry>,
) -> Option<CachedBatch> {
    let start_time = Instant::now();
    let batch_id = batch.id;
    metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);

    match client.prefill(batch, cached_batch).await {
        Ok((generations, next_batch, timings)) => {
            let start_filtering_time = Instant::now();
            // Send generated tokens and filter stopped entries
            filter_send_generations(generations, entries);

            // Filter next batch and remove requests that were stopped
            let next_batch = filter_batch(client, next_batch, entries).await;

            if let Some(concat_duration) = timings.concat {
                metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
                    .record(concat_duration.as_secs_f64());
            }
            metrics::histogram!("tgi_batch_forward_duration", "method" => "prefill")
                .record(timings.forward.as_secs_f64());
            metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
                .record(timings.decode.as_secs_f64());
            metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
                .record(start_filtering_time.elapsed().as_secs_f64());
            metrics::histogram!("tgi_batch_inference_duration", "method" => "prefill")
                .record(start_time.elapsed().as_secs_f64());
            metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
            next_batch
        }
        // If we have an error, we discard the whole batch
        Err(err) => {
            let _ = client.clear_cache(Some(batch_id)).await;
            send_errors(err, entries);
            metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
            None
        }
    }
}

#[instrument(skip_all)]
async fn decode(
    client: &mut ShardedClient,
    batches: Vec<CachedBatch>,
    entries: &mut IntMap<u64, Entry>,
) -> Option<CachedBatch> {
    let start_time = Instant::now();
    let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
    metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);

    match client.decode(batches).await {
        Ok((generations, next_batch, timings)) => {
            let start_filtering_time = Instant::now();
            // Send generated tokens and filter stopped entries
            filter_send_generations(generations, entries);

            // Filter next batch and remove requests that were stopped
            let next_batch = filter_batch(client, next_batch, entries).await;

            if let Some(concat_duration) = timings.concat {
                metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
                    .record(concat_duration.as_secs_f64());
            }
            metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
                .record(timings.forward.as_secs_f64());
            metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
                .record(timings.decode.as_secs_f64());
            metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
                .record(start_filtering_time.elapsed().as_secs_f64());
            metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
                .record(start_time.elapsed().as_secs_f64());
            metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
            next_batch
        }
        // If we have an error, we discard the whole batch
        Err(err) => {
            for id in batch_ids {
                let _ = client.clear_cache(Some(id)).await;
            }
            send_errors(err, entries);
            metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
            None
        }
    }
}

/// Filter a `batch` and remove all requests not present in `entries`
#[instrument(skip_all)]
async fn filter_batch(
    client: &mut ShardedClient,
    next_batch: Option<CachedBatch>,
    entries: &IntMap<u64, Entry>,
) -> Option<CachedBatch> {
    let mut batch = next_batch?;

    // No need to filter
    if batch.size as usize == entries.len() {
        return Some(batch);
    }

    let id = batch.id;

    // Retain only requests that are still in entries
    batch.request_ids.retain(|id| entries.contains_key(id));

    if batch.request_ids.is_empty() {
        // All requests have been filtered out
        // Next batch is now empty
        // Clear it from the Python shards cache
        // We unwrap here as we need to panic since we cannot recover if this method fails
        client.clear_cache(Some(id)).await.unwrap();
        None
    } else {
        // Filter Python shard cache
        // We unwrap here as we need to panic since we cannot recover if this method fails
        client.filter_batch(id, batch.request_ids).await.unwrap()
    }
}

/// Send one or multiple `InferStreamResponse` to Infer for all `entries`
/// and filter entries
#[instrument(skip_all)]
fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
    generations.into_iter().for_each(|generation| {
        let id = generation.request_id;
        // Get entry
        // We can `expect` here as the request id should always be in the entries
        let entry = entries
            .get(&id)
            .expect("ID not found in entries. This is a bug.");

        // Create and enter a span to link this function back to the entry
        let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
        // Send generation responses back to the infer task
        // If the receive an error from the Flume channel, it means that the client dropped the
        // request and we need to stop generating hence why we unwrap_or(true)
        let stopped = send_responses(generation, entry).inspect_err(|_err| {
            tracing::error!("Entry response channel error.");
            metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
        }).unwrap_or(true);
        if stopped {
            entries.remove(&id).expect("ID not found in entries. This is a bug.");
        }
    });
}

/// Send responses through the `entry` response channel
fn send_responses(
    generation: Generation,
    entry: &Entry,
) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
    // Return directly if the channel is disconnected
    if entry.response_tx.is_closed() {
        metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
        return Ok(true);
    }

    let mut stopped = false;

    if let Some(prefill_tokens) = generation.prefill_tokens {
        // Create Token objects
        // We do that here instead of in the Python code as Rust for loops are faster
        let prefill_tokens = prefill_tokens
            .ids
            .into_iter()
            .zip(prefill_tokens.logprobs)
            .zip(prefill_tokens.texts)
            .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
            .collect();

        // Send message
        entry
            .response_tx
            .send(Ok(InferStreamResponse::Prefill(prefill_tokens)))?;
    }

    // Create last Token
    let tokens_ = generation.tokens.expect("Non empty tokens in generation");
    let n = tokens_.ids.len();
    metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
    let mut iterator = tokens_
        .ids
        .into_iter()
        .zip(tokens_.logprobs)
        .zip(tokens_.texts)
        .zip(tokens_.is_special)
        .enumerate()
        .peekable();
    while let Some((i, (((id, logprob), text), special))) = iterator.next() {
        let token = Token {
            id,
            text,
            logprob,
            special,
        };
        let top_tokens = if let Some(top_tokens_) = generation.top_tokens.get(i) {
            top_tokens_
                .ids
                .iter()
                .zip(top_tokens_.logprobs.iter())
                .zip(top_tokens_.texts.iter())
                .zip(top_tokens_.is_special.iter())
                .map(|(((&id, &logprob), text), &special)| Token {
                    id,
                    text: text.to_string(),
                    logprob,
                    special,
                })
                .collect()
        } else {
            vec![]
        };
        match (&generation.generated_text, iterator.peek()) {
            (Some(generated_text), None) => {
                // Generation has ended
                stopped = true;
                // Send message
                entry.response_tx.send(Ok(InferStreamResponse::End {
                    token,
                    top_tokens,
                    generated_text: GeneratedText::from(generated_text.clone()),
                    queued: entry.queue_time,
                    start: entry.batch_time.unwrap(),
                }))?;
            }
            _ => {
                // Send message
                entry
                    .response_tx
                    .send(Ok(InferStreamResponse::Intermediate { token, top_tokens }))?;
            }
        }
    }

    Ok(stopped)
}

/// Send errors to Infer for all `entries`
#[instrument(skip_all)]
fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
    entries.drain().for_each(|(_, entry)| {
        // Create and enter a span to link this function back to the entry
        let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
        let err = InferError::GenerationError(error.to_string());
        metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
        tracing::error!("{err}");

        // unwrap_or is valid here as we don't care if the receiver is gone.
        entry
            .response_tx
            .send(Err(err))
            .unwrap_or(());
    });
}

impl From<crate::client::GeneratedText> for GeneratedText {
    fn from(value: crate::client::GeneratedText) -> Self {
        let v3_finish_reason = crate::client::FinishReason::try_from(value.finish_reason).unwrap();
        let finish_reason = match v3_finish_reason {
            crate::client::FinishReason::Length => FinishReason::Length,
            crate::client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
            crate::client::FinishReason::StopSequence => FinishReason::StopSequence,
        };

        Self {
            text: value.text,
            generated_tokens: value.generated_tokens,
            finish_reason,
            seed: value.seed,
        }
    }
}


================================================
FILE: backends/v3/src/block_allocator.rs
================================================
use std::sync::Arc;
use tokio::sync::{mpsc, oneshot};

use crate::radix::RadixAllocator;
use text_generation_router::usage_stats::Env;
#[derive(Debug, Clone)]
pub struct BlockAllocation {
    pub allocation_id: u64,
    pub blocks: Vec<u32>,
    pub slots: Vec<u32>,

    /// Prefix that was cached and for which the KV does not have to
    /// be recomputed.
    pub prefix_len: u32,

    pub(crate) block_allocator: Option<BlockAllocator>,
}

impl Drop for BlockAllocation {
    fn drop(&mut self) {
        if let Some(block_allocator) = self.block_allocator.as_mut() {
            block_allocator.free(self.blocks.clone(), self.allocation_id)
        }
    }
}

#[derive(Debug, Clone)]
pub struct BlockAllocator {
    /// Channel to communicate with the background task
    block_allocator: mpsc::UnboundedSender<BlockAllocatorCommand>,
}

impl BlockAllocator {
    pub(crate) fn new(
        max_batch_total_tokens: u32,
        block_size: u32,
        prefix_caching: bool,
        window_size: Option<u32>,
    ) -> Self {
        // Create channel
        let (sender, receiver) = mpsc::unbounded_channel();

        // Launch background queue task
        tokio::spawn(block_allocator_task(
            max_batch_total_tokens / block_size,
            block_size,
            prefix_caching,
            window_size,
            receiver,
        ));

        Self {
            block_allocator: sender,
        }
    }

    pub(crate) async fn allocate(
        &self,
        tokens: u32,
        prefill_tokens: Option<Arc<Vec<u32>>>,
    ) -> Option<BlockAllocation> {
        let (response_sender, response_receiver) = oneshot::channel();
        self.block_allocator
            .send(BlockAllocatorCommand::Allocate {
                tokens,
                prefill_tokens,
                response_sender,
            })
            .unwrap();

        response_receiver.await.unwrap().map(|mut allocation| {
            allocation.block_allocator = Some(self.clone());
            allocation
        })
    }

    pub(crate) fn free(&self, blocks: Vec<u32>, allocation_id: u64) {
        self.block_allocator
            .send(BlockAllocatorCommand::Free {
                allocation_id,
                blocks,
            })
            .unwrap();
    }
}

async fn block_allocator_task(
    blocks: u32,
    block_size: u32,
    prefix_caching: bool,
    window_size: Option<u32>,
    mut receiver: mpsc::UnboundedReceiver<BlockAllocatorCommand>,
) {
    let mut allocator: Box<dyn Allocator + Send> = if prefix_caching {
        Box::new(RadixAllocator::new(block_size, blocks, window_size))
    } else {
        Box::new(SimpleAllocator::new(blocks, block_size, window_size))
    };
    while let Some(cmd) = receiver.recv().await {
        match cmd {
            BlockAllocatorCommand::Free {
                blocks,
                allocation_id,
            } => allocator.free(blocks, allocation_id),
            BlockAllocatorCommand::Allocate {
                tokens,
                prefill_tokens,
                response_sender,
            } => {
                response_sender
                    .send(allocator.allocate(tokens, prefill_tokens))
                    .unwrap();
            }
        }
    }
}

#[derive(Debug)]
enum BlockAllocatorCommand {
    Free {
        blocks: Vec<u32>,
        allocation_id: u64,
    },
    Allocate {
        tokens: u32,
        prefill_tokens: Option<Arc<Vec<u32>>>,
        response_sender: oneshot::Sender<Option<BlockAllocation>>,
    },
}

pub trait Allocator {
    fn allocate(
        &mut self,
        tokens: u32,
        prefill_tokens: Option<Arc<Vec<u32>>>,
    ) -> Option<BlockAllocation>;

    fn free(&mut self, blocks: Vec<u32>, allocation_id: u64);
}
pub struct SimpleAllocator {
    free_blocks: Vec<u32>,
    block_size: u32,
    window_size: Option<u32>,
    is_hpu_device: bool,
}

impl SimpleAllocator {
    fn new(blocks: u32, block_size: u32, window_size: Option<u32>) -> Self {
        SimpleAllocator {
            block_size,
            // Block 0 is reserved for health checks
            free_blocks: (1..blocks).collect(),
            window_size,
            is_hpu_device: Env::new().is_hpu_device(),
        }
    }
}

impl Allocator for SimpleAllocator {
    fn allocate(
        &mut self,
        tokens: u32,
        _prefill_tokens: Option<Arc<Vec<u32>>>,
    ) -> Option<BlockAllocation> {
        let mut tokens = tokens;
        if self.is_hpu_device {
            // need 1 slot for ping-pong optimization
            tokens += 1;
        }
        // Apply window size
        let (required_blocks, repeats) = {
            let (tokens, repeats) = match self.window_size {
                None => (tokens, 1),
                Some(window_size) => {
                    let repeats = tokens.div_ceil(window_size);
                    let tokens = core::cmp::min(tokens, window_size);
                    (tokens, repeats as usize)
                }
            };
            // Pad to a multiple of block size
            let required_blocks = tokens.div_ceil(self.block_size);
            (required_blocks, repeats)
        };
        let tokens = tokens as usize;
        if required_blocks > self.free_blocks.len() as u32 {
            None
        } else {
            if self.is_hpu_device {
                self.free_blocks.sort_by(|a, b| b.cmp(a));
            }
            let mut blocks = self
                .free_blocks
                .split_off(self.free_blocks.len() - required_blocks as usize);
            if self.is_hpu_device {
                blocks.sort();
            }
            let mut slots =
                Vec::with_capacity((required_blocks * self.block_size * repeats as u32) as usize);

            'slots: for block_id in blocks.repeat(repeats).iter() {
                for s in (block_id * self.block_size)..((block_id + 1) * self.block_size) {
                    slots.push(s);
                    if slots.len() == tokens {
                        break 'slots;
                    }
                }
            }
            Some(BlockAllocation {
                allocation_id: 0,
                blocks,
                slots,
                prefix_len: 0,
                block_allocator: None,
            })
        }
    }

    fn free(&mut self, blocks: Vec<u32>, _allocation_id: u64) {
        self.free_blocks.extend(blocks)
    }
}


================================================
FILE: backends/v3/src/client/grpc_client.rs
================================================
/// Single shard Client
use crate::client::{pb, Chunk};
use crate::client::{ClientError, Result, WARMUP_IMAGE_BASE64};
use base64::engine::general_purpose::STANDARD;
use base64::Engine;
use grpc_metadata::InjectTelemetryContext;
use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
use pb::generate::v3::*;
use std::cmp::min;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tracing::instrument;

/// Text Generation Inference gRPC client
#[derive(Debug, Clone)]
pub struct Client {
    stub: TextGenerationServiceClient<Channel>,
}

impl Client {
    /// Returns a client connected to the given url
    #[allow(dead_code)]
    pub async fn connect(uri: Uri) -> Result<Self> {
        let channel = Channel::builder(uri).connect().await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a client connected to the given unix socket
    pub async fn connect_uds(path: String) -> Result<Self> {
        let channel = Channel::from_shared("http://[::]:50051".to_string())
            .unwrap()
            .connect_with_connector(tower::service_fn(move |_: Uri| {
                tokio::net::UnixStream::connect(path.clone())
            }))
            .await?;

        Ok(Self {
            stub: TextGenerationServiceClient::new(channel),
        })
    }

    /// Returns a list of uris or unix sockets of all shards
    #[instrument(skip(self))]
    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
        let response = self.stub.service_discovery(request).await.map_err(|_| {
            ClientError::Connection("Server does not support v3 interface".to_string())
        })?;
        let urls = response
            .into_inner()
            .urls
            .into_iter()
            // Remove unix socket prefix
            .map(|url| match url.strip_prefix("unix://") {
                None => url,
                Some(stripped_url) => stripped_url.to_string(),
            })
            .collect();
        Ok(urls)
    }

    /// Get model info
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<InfoResponse> {
        let request = tonic::Request::new(InfoRequest {}).inject_context();
        let response = self.stub.info(request).await?.into_inner();
        Ok(response)
    }

    /// Get model health
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let request = tonic::Request::new(HealthRequest {}).inject_context();
        let response = self.stub.health(request).await?.into_inner();
        Ok(response)
    }

    /// Clear the past generations cache
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
        self.stub.clear_cache(request).await?;
        Ok(())
    }

    /// Filter a cached batch
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let request = tonic::Request::new(FilterBatchRequest {
            batch_id,
            request_ids,
        })
        .inject_context();
        let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
        Ok(filtered_batch.batch)
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip_all)]
    pub async fn warmup(
        &mut self,
        max_input_tokens: Option<u32>,
        max_prefill_tokens: u32,
        max_total_tokens: Option<u32>,
        max_batch_size: Option<usize>,
    ) -> Result<(Option<u32>, u32, u32)> {
        let mut n_tokens = 0;
        let mut requests = Vec::new();
        // Create requests
        while n_tokens < max_prefill_tokens {
            let mut truncate = max_prefill_tokens - n_tokens;
            if let Some(max_input_tokens) = max_input_tokens {
                truncate = min(max_input_tokens, truncate);
            }

            let mut input_chunks = Vec::new();
            input_chunks.push(Chunk::Text("_test ".to_string().repeat(truncate as usize)).into());
            if n_tokens == 0 {
                input_chunks.push(
                    Chunk::Image(Image {
                        // Safe unwrap, because we control the data.
                        data: STANDARD.decode(WARMUP_IMAGE_BASE64).unwrap(),
                        mimetype: "image/jpeg;base64".to_string(),
                    })
                    .into(),
                );
            }

            // Send stringly-typed inputs for compatibility for backends that haven't
            // been updated to support chunks.

            let mut inputs = String::new();
            inputs.push_str(&"_test ".to_string().repeat(truncate as usize));
            if n_tokens == 0 {
                // 1 request is enough to test vision heads.
                // Sending images on other queries messes up easily with truncation.
                inputs.push_str(&format!(
                    "![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
                ));
            }

            let max_new_tokens = if let Some(max_total_tokens) = max_total_tokens {
                max_total_tokens - truncate
            } else {
                1
            };

            requests.push(Request {
                id: 0,
                inputs,
                add_special_tokens: true,
                input_chunks: Some(Input {
                    chunks: input_chunks,
                }),
                // We truncate the input on the server side to be sure that it has the correct size
                truncate,
                // Blocks and slots will be set on the server side if we use paged attention
                blocks: vec![],
                slots: vec![],
                cache_len: 0,
                chunk_len: None,
                // Set sampling parameters to also take these ops into account in the max memory
                parameters: Some(NextTokenChooserParameters {
                    temperature: 0.9,
                    top_k: 10,
                    top_p: 0.9,
                    typical_p: 0.9,
                    do_sample: false,
                    seed: 0,
                    repetition_penalty: 1.2,
                    frequency_penalty: 0.1,
                    watermark: true,
                    grammar: String::new(),
                    grammar_type: GrammarType::None as i32,
                }),
                stopping_parameters: Some(StoppingCriteriaParameters {
                    max_new_tokens,
                    stop_sequences: vec![],
                    ignore_eos_token: true,
                }),
                prefill_logprobs: true,
                top_n_tokens: 20,
                adapter_id: None,
            });
            n_tokens += truncate;

            // Check max_batch_size
            if Some(requests.len()) == max_batch_size {
                break;
            }
        }

        let batch = Batch {
            id: 0,
            size: requests.len() as u32,
            requests,
            max_tokens: max_input_tokens.unwrap_or(0),
            max_blocks: 0,
        };

        let request = tonic::Request::new(WarmupRequest {
            batch: Some(batch),
            max_input_tokens,
            max_prefill_tokens,
            max_total_tokens,
        })
        .inject_context();
        let response = self.stub.warmup(request).await?.into_inner();
        Ok((
            response.max_supported_total_tokens,
            response.max_input_tokens,
            response.max_total_tokens,
        ))
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
        cached_batch: Option<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let request = tonic::Request::new(PrefillRequest {
            batch: Some(batch),
            cached_batch,
        })
        .inject_context();
        let response = self.stub.prefill(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            PrefillTimings::new(
                response.concat_ns,
                response.forward_ns,
                response.decode_ns,
                response.total_ns,
            ),
        ))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
        let response = self.stub.decode(request).await?.into_inner();
        Ok((
            response.generations,
            response.batch,
            DecodeTimings::new(
                response.concat_ns,
                response.forward_ns,
                response.decode_ns,
                response.total_ns,
            ),
        ))
    }
}

pub struct PrefillTimings {
    pub concat: Option<Duration>,
    pub forward: Duration,
    pub decode: Duration,
    pub total: Duration,
}

impl PrefillTimings {
    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
        Self {
            concat: concat_ns.map(Duration::from_nanos),
            forward: Duration::from_nanos(forward_ns),
            decode: Duration::from_nanos(decode_ns),
            total: Duration::from_nanos(total_ns),
        }
    }
}

pub struct DecodeTimings {
    pub concat: Option<Duration>,
    pub forward: Duration,
    pub decode: Duration,
    pub total: Duration,
}

impl DecodeTimings {
    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
        Self {
            concat: concat_ns.map(Duration::from_nanos),
            forward: Duration::from_nanos(forward_ns),
            decode: Duration::from_nanos(decode_ns),
            total: Duration::from_nanos(total_ns),
        }
    }
}


================================================
FILE: backends/v3/src/client/mod.rs
================================================
//! Text Generation gRPC client library

use async_trait::async_trait;
use thiserror::Error;
use tonic::transport;
use tonic::Status;

#[allow(clippy::derive_partial_eq_without_eq)]
mod pb;

mod grpc_client;
mod sharded_client;

pub use grpc_client::Client;
pub use pb::generate::v3::{
    input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
    HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
    StoppingCriteriaParameters,
};
pub use sharded_client::ShardedClient;

#[async_trait]
pub trait Health {
    /// Check if a generate server is healthy by asking it to allocate a tensor on device
    async fn device_health(&self) -> Result<()>;

    /// Check if a generate server is healthy by doing a forward pass.
    /// EXPENSIVE
    async fn model_health(&self) -> Result<()>;
}

#[derive(Error, Debug, Clone)]
pub enum ClientError {
    #[error("Could not connect to Text Generation server: {0}")]
    Connection(String),
    #[error("Server error: {0}")]
    Generation(String),
    #[error("Sharded results are empty")]
    EmptyResults,
}

impl From<Status> for ClientError {
    fn from(err: Status) -> Self {
        let err = Self::Generation(err.message().to_string());
        tracing::error!("{err}");
        err
    }
}

impl From<transport::Error> for ClientError {
    fn from(err: transport::Error) -> Self {
        let err = Self::Connection(err.to_string());
        tracing::error!("{err}");
        err
    }
}

// Small convenience re-wrapping of `Chunk`.
impl From<Chunk> for InputChunk {
    fn from(chunk: Chunk) -> Self {
        InputChunk { chunk: Some(chunk) }
    }
}

static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";

pub type Result<T> = std::result::Result<T, ClientError>;


================================================
FILE: backends/v3/src/client/sharded_client.rs
================================================
use crate::client::Health;
/// Multi shard Client
use crate::client::{ClientError, Result};

use crate::client::grpc_client::{DecodeTimings, PrefillTimings};
use crate::client::{
    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};
use crate::client::{Chunk, InfoResponse, Input};
use async_trait::async_trait;
use futures::future::join_all;
use tonic::transport::Uri;
use tracing::instrument;

#[derive(Debug, Clone)]
/// Text Generation Inference gRPC multi client
pub struct ShardedClient {
    clients: Vec<Client>,
}

impl ShardedClient {
    fn new(clients: Vec<Client>) -> Self {
        Self { clients }
    }

    /// Create a new ShardedClient from a master client. The master client will communicate with
    /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
    async fn from_master_client(mut master_client: Client) -> Result<Self> {
        // Get all uris/unix sockets from the master client
        let uris = master_client.service_discovery().await?;
        let futures = uris.into_iter().map(Client::connect_uds);
        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
        Ok(Self::new(clients?))
    }

    /// Returns a client connected to the given uri
    #[allow(dead_code)]
    pub async fn connect(uri: Uri) -> Result<Self> {
        let master_client = Client::connect(uri).await?;
        Self::from_master_client(master_client).await
    }

    /// Returns a client connected to the given unix socket
    pub async fn connect_uds(path: String) -> Result<Self> {
        let master_client = Client::connect_uds(path).await?;
        Self::from_master_client(master_client).await
    }

    /// Get the model info
    #[instrument(skip(self))]
    pub async fn info(&mut self) -> Result<InfoResponse> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.info())
            .collect();
        join_all(futures).await.pop().unwrap()
    }

    /// GRPC health check
    #[instrument(skip(self))]
    pub async fn health(&mut self) -> Result<HealthResponse> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.health())
            .collect();
        join_all(futures).await.pop().unwrap()
    }

    /// Clear the past generations cache
    #[instrument(skip(self))]
    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| client.clear_cache(batch_id))
            .collect();
        join_all(futures).await.into_iter().collect()
    }

    /// Filter a cached batch
    #[instrument(skip(self))]
    pub async fn filter_batch(
        &mut self,
        batch_id: u64,
        request_ids: Vec<u64>,
    ) -> Result<Option<CachedBatch>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
            .collect();
        // all shards return the same message
        join_all(futures).await.pop().unwrap()
    }

    /// Warmup on a max size batch
    ///
    /// Returns the maximum amount of tokens supported by the hardware
    #[instrument(skip(self))]
    pub async fn warmup(
        &mut self,
        max_input_length: Option<u32>,
        max_prefill_tokens: u32,
        max_total_tokens: Option<u32>,
        max_batch_size: Option<usize>,
    ) -> Result<(Option<u32>, u32, u32)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| {
                Box::pin(client.warmup(
                    max_input_length,
                    max_prefill_tokens,
                    max_total_tokens,
                    max_batch_size,
                ))
            })
            .collect();
        let results = join_all(futures)
            .await
            .into_iter()
            .collect::<Result<Vec<(Option<u32>, u32, u32)>>>()?;

        // Take the minimum value
        // Different shards hold different parts of vocab, might yield
        // different available block size.
        let min = results
            .iter()
            .min()
            .expect("Expect at least 1 warmup result");
        Ok(*min)
    }

    /// Generate one token for each request in the given batch
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
        cached_batch: Option<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.prefill(batch.clone(), cached_batch.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.decode(batches.clone())))
            .collect();
        #[allow(clippy::type_complexity)]
        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
            join_all(futures).await.into_iter().collect();
        let mut results = results?;

        let (mut generations, next_batch, mut timings) =
            results.pop().ok_or(ClientError::EmptyResults)?;

        // Merge generations from different model shards
        for (mut shard_generations, _, shard_timings) in results.into_iter() {
            generations.append(&mut shard_generations);
            // Return the timings of the slowest shard
            if shard_timings.total > timings.total {
                timings = shard_timings;
            }
        }
        Ok((generations, next_batch, timings))
    }
}

#[async_trait]
impl Health for ShardedClient {
    async fn device_health(&self) -> Result<()> {
        self.clone().health().await?;
        Ok(())
    }

    async fn model_health(&self) -> Result<()> {
        // Dummy batch of 1 token and 1 generated token
        let liveness_request = Request {
            id: u64::MAX,
            inputs: "liveness".to_string(),
            input_chunks: Some(Input {
                chunks: vec![Chunk::Text("liveness".into()).into()],
            }),
            truncate: 1,
            add_special_tokens: false,
            prefill_logprobs: false,
            parameters: Some(NextTokenChooserParameters {
                temperature: 1.0,
                top_k: 0,
                top_p: 1.0,
                typical_p: 1.0,
                do_sample: false,
                seed: 0,
                repetition_penalty: 1.0,
                frequency_penalty: 0.0,
                watermark: false,
                grammar: String::new(),
                grammar_type: GrammarType::None as i32,
            }),
            stopping_parameters: Some(StoppingCriteriaParameters {
                max_new_tokens: 1,
                stop_sequences: vec![],
                ignore_eos_token: false,
            }),
            top_n_tokens: 0,
            // Block 0 is reserved for health checks
            blocks: vec![0],
            slots: vec![0],
            cache_len: 0,
            adapter_id: None,
            chunk_len: None,
        };
        let batch = Batch {
            id: u64::MAX,
            requests: vec![liveness_request],
            size: 1,
            max_tokens: 2,
            max_blocks: 1,
        };
        self.clone().prefill(batch, None).await?;
        Ok(())
    }
}


================================================
FILE: backends/v3/src/lib.rs
================================================
mod backend;
pub mod block_allocator;
mod client;
mod queue;
pub mod radix;

use crate::client::{ClientError, ShardedClient};
pub(crate) use backend::BackendV3;
use serde::Serialize;
use thiserror::Error;
use utoipa::ToSchema;

#[derive(Clone, Debug, Serialize, ToSchema)]
pub struct BackendInfo {
    /// Mandatory
    #[schema(example = "cuda")]
    pub model_device_type: String,
    #[schema(example = "torch.float16")]
    pub model_dtype: String,

    /// Backend parameters
    #[schema(example = "1")]
    pub speculate: usize,
    #[schema(example = "1.2")]
    pub waiting_served_ratio: f32,
    #[schema(example = "32000")]
    pub max_batch_total_tokens: u32,
    #[schema(example = "20")]
    pub max_waiting_tokens: usize,
    #[schema(nullable = true, example = "null")]
    pub max_batch_size: Option<usize>,
    #[schema(example = "false")]
    pub support_chunking: bool,
    #[schema(example = "false")]
    pub prefix_caching: bool,
    #[schema(example = "flashinfer")]
    pub attention_impl: String,
    #[schema(example = "1")]
    pub block_size: u32,

    #[schema(example = "30000")]
    pub max_input_tokens: usize,
    #[schema(example = "32000")]
    pub max_total_tokens: usize,
}

#[allow(clippy::too_many_arguments)]
pub async fn connect_backend(
    max_input_tokens: Option<usize>,
    max_total_tokens: Option<usize>,
    master_shard_uds_path: String,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
    max_batch_total_tokens: Option<u32>,
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
) -> Result<(BackendV3, BackendInfo), V3Error> {
    // Helper function
    let check_max_batch_total_tokens = |(
        max_supported_batch_total_tokens,
        shard_max_input_tokens,
        shard_max_total_tokens,
    ): (Option<u32>, u32, u32)|
     -> Result<(u32, usize, usize), V3Error> {
        if let Some(max_input_tokens) = max_input_tokens {
            assert_eq!(max_input_tokens as u32, shard_max_input_tokens);
        }
        if let Some(max_total_tokens) = max_total_tokens {
            assert_eq!(max_total_tokens as u32, shard_max_total_tokens);
        }
        match max_supported_batch_total_tokens {
            // Older models do not support automatic max-batch-total-tokens
            None => {
                let max_batch_total_tokens = max_batch_total_tokens.unwrap_or(
                    16000
                        .max(shard_max_total_tokens)
                        .max(max_batch_prefill_tokens),
                );
                tracing::warn!("Model does not support automatic max batch total tokens");
                Ok((
                    max_batch_total_tokens,
                    shard_max_input_tokens as usize,
                    shard_max_total_tokens as usize,
                ))
            }
            // Flash attention models return their max supported total tokens
            Some(max_supported_batch_total_tokens) => {
                // Warn if user added his own max-batch-total-tokens as we will ignore it
                if max_batch_total_tokens.is_some() {
                    tracing::warn!(
                        "`--max-batch-total-tokens` is deprecated for Flash \
                        Attention models."
                    );
                    tracing::warn!(
                        "Inferred max batch total tokens: {max_supported_batch_total_tokens}"
                    );
                }
                if shard_max_total_tokens > max_supported_batch_total_tokens {
                    return Err(V3Error::NotEnoughMemory(shard_max_total_tokens as usize));
                }

                Ok((
                    max_supported_batch_total_tokens,
                    shard_max_input_tokens as usize,
                    shard_max_total_tokens as usize,
                ))
            }
        }
    };

    let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path)
        .await
        .map_err(V3Error::Connection)?;

    // server is running on v3
    // Clear the cache; useful if the webserver rebooted
    sharded_client
        .clear_cache(None)
        .await
        .map_err(V3Error::Cache)?;
    // Get info from the shard
    let shard_info = sharded_client.info().await.map_err(V3Error::Info)?;

    // Warmup model
    tracing::info!("Warming up model");
    let answer = sharded_client
        .warmup(
            max_input_tokens.map(|p| p as u32),
            max_batch_prefill_tokens,
            max_total_tokens.map(|p| p as u32),
            max_batch_size,
        )
        .await
        .map_err(V3Error::Warmup)?;
    let (max_batch_total_tokens, max_input_tokens, max_total_tokens) =
        check_max_batch_total_tokens(answer)?;
    tracing::info!("Setting max batch total tokens to {max_batch_total_tokens}");
    metrics::gauge!("tgi_batch_max_total_tokens").set(max_batch_total_tokens);

    let backend_info = BackendInfo {
        waiting_served_ratio,
        max_batch_total_tokens,
        max_input_tokens,
        max_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        model_device_type: shard_info.device_type.clone(),
        model_dtype: shard_info.dtype.clone(),
        speculate: shard_info.speculate as usize,
        support_chunking: shard_info.support_chunking,
        prefix_caching: shard_info.use_prefix_caching,
        attention_impl: shard_info.attention_impl.clone(),
        block_size: shard_info.block_size,
    };

    let backend = BackendV3::new(
        sharded_client,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        shard_info,
    );

    tracing::info!("Using backend V3");

    Ok((backend, backend_info))
}

#[derive(Debug, Error)]
pub enum V3Error {
    #[error("Unable to clear the Python model shards cache: {0}")]
    Cache(ClientError),
    #[error("Unable to connect to the Python model shards: {0}")]
    Connection(ClientError),
    #[error("Unable to get the Python model shards info: {0}")]
    Info(ClientError),
    #[error("Unable to warmup the Python model shards: {0}")]
    Warmup(ClientError),
    #[error("Not enough memory to handle `max_total_tokens={0}`")]
    NotEnoughMemory(usize),
}


================================================
FILE: backends/v3/src/main.rs
================================================
use clap::{Parser, Subcommand};
use text_generation_router::{server, usage_stats};
use text_generation_router_v3::{connect_backend, V3Error};
use thiserror::Error;

/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    #[command(subcommand)]
    command: Option<Commands>,

    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,
    #[clap(default_value = "2", long, env)]
    max_best_of: usize,
    #[clap(default_value = "4", long, env)]
    max_stop_sequences: usize,
    #[clap(default_value = "5", long, env)]
    max_top_n_tokens: u32,
    #[clap(long, env)]
    max_input_tokens: Option<usize>,
    #[clap(long, env)]
    max_total_tokens: Option<usize>,
    #[clap(default_value = "1.2", long, env)]
    waiting_served_ratio: f32,
    #[clap(default_value = "4096", long, env)]
    max_batch_prefill_tokens: u32,
    #[clap(long, env)]
    max_batch_total_tokens: Option<u32>,
    #[clap(default_value = "20", long, env)]
    max_waiting_tokens: usize,
    #[clap(long, env)]
    max_batch_size: Option<usize>,
    #[clap(default_value = "0.0.0.0", long, env)]
    hostname: String,
    #[clap(default_value = "3000", long, short, env)]
    port: u16,
    #[clap(default_value = "9000", long, short, env)]
    prometheus_port: u16,
    #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
    master_shard_uds_path: String,
    #[clap(default_value = "bigscience/bloom", long, env)]
    tokenizer_name: String,
    #[clap(long, env)]
    tokenizer_config_path: Option<String>,
    #[clap(long, env)]
    revision: Option<String>,
    #[clap(long, env, value_enum)]
    trust_remote_code: bool,
    #[clap(default_value = "2", long, env)]
    validation_workers: usize,
    #[clap(long, env)]
    api_key: Option<String>,
    #[clap(long, env)]
    json_output: bool,
    #[clap(long, env)]
    otlp_endpoint: Option<String>,
    #[clap(default_value = "text-generation-inference.router", long, env)]
    otlp_service_name: String,
    #[clap(long, env)]
    cors_allow_origin: Option<Vec<String>>,
    #[clap(long, env)]
    ngrok: bool,
    #[clap(long, env)]
    ngrok_authtoken: Option<String>,
    #[clap(long, env)]
    ngrok_edge: Option<String>,
    #[clap(long, env, default_value_t = false)]
    disable_grammar_support: bool,
    #[clap(default_value = "4", long, env)]
    max_client_batch_size: usize,
    #[clap(default_value = "on", long, env)]
    usage_stats: usage_stats::UsageStatsLevel,
    #[clap(default_value = "2000000", long, env)]
    payload_limit: usize,
    #[clap(default_value = "1073741824", long, env)]
    max_image_fetch_size: usize,
}

#[derive(Debug, Subcommand)]
enum Commands {
    PrintSchema,
}

#[tokio::main]
async fn main() -> Result<(), RouterError> {
    // Get args
    let args = Args::parse();
    // Pattern match configuration
    let Args {
        command,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        hostname,
        port,
        prometheus_port,
        master_shard_uds_path,
        tokenizer_name,
        tokenizer_config_path,
        revision,
        trust_remote_code,
        validation_workers,
        api_key,
        json_output,
        otlp_endpoint,
        otlp_service_name,
        cors_allow_origin,
        ngrok,
        ngrok_authtoken,
        ngrok_edge,
        disable_grammar_support,
        max_client_batch_size,
        usage_stats,
        payload_limit,
        max_image_fetch_size,
    } = args;

    if let Some(Commands::PrintSchema) = command {
        use utoipa::OpenApi;
        let api_doc = text_generation_router::server::ApiDoc::openapi();
        let api_doc = serde_json::to_string_pretty(&api_doc).unwrap();
        println!("{}", api_doc);
        std::process::exit(0);
    };
    text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);

    // Validate args
    if validation_workers == 0 {
        return Err(RouterError::ArgumentValidation(
            "`validation_workers` must be > 0".to_string(),
        ));
    }
    if let Some(max_batch_size) = max_batch_size {
        if max_batch_size == 0 {
            return Err(RouterError::ArgumentValidation(
                "`max_batch_size` must be > 0".to_string(),
            ));
        }
    }

    let (backend, backend_info) = connect_backend(
        max_input_tokens,
        max_total_tokens,
        master_shard_uds_path,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
    )
    .await?;

    // Validate remaining args now that the backend is known
    let support_chunking = backend_info.support_chunking;
    let max_batch_total_tokens = backend_info.max_batch_total_tokens;

    if max_input_tokens.is_none() {
        tracing::info!(
            "Maximum input tokens defaulted to {}",
            backend_info.max_input_tokens
        );
    }
    if max_total_tokens.is_none() {
        tracing::info!(
            "Maximum total tokens defaulted to {}",
            backend_info.max_total_tokens
        );
    }

    let max_input_tokens = backend_info.max_input_tokens;
    let max_total_tokens = backend_info.max_total_tokens;
    if max_input_tokens >= max_total_tokens {
        return Err(RouterError::ArgumentValidation(
            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
        ));
    }

    if max_input_tokens as u32 > max_batch_prefill_tokens && !support_chunking {
        return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
    }
    if max_batch_prefill_tokens > max_batch_total_tokens {
        return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}")));
    }
    if max_total_tokens as u32 > max_batch_total_tokens {
        return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}")));
    }

    // Run server
    server::run(
        backend,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        validation_workers,
        api_key,
        tokenizer_name,
        tokenizer_config_path,
        revision,
        trust_remote_code,
        hostname,
        port,
        cors_allow_origin,
        ngrok,
        ngrok_authtoken,
        ngrok_edge,
        disable_grammar_support,
        max_client_batch_size,
        usage_stats,
        payload_limit,
        max_image_fetch_size,
        prometheus_port,
    )
    .await?;
    Ok(())
}

#[derive(Debug, Error)]
enum RouterError {
    #[error("Argument validation error: {0}")]
    ArgumentValidation(String),
    #[error("Backend failed: {0}")]
    Backend(#[from] V3Error),
    #[error("WebServer error: {0}")]
    WebServer(#[from] server::WebServerError),
    #[error("Tokio runtime failed to start: {0}")]
    Tokio(#[from] std::io::Error),
}


================================================
FILE: backends/v3/src/queue.rs
================================================
use crate::block_allocator::{BlockAllocation, BlockAllocator};
use crate::client;
use crate::client::{
    Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
};
use nohash_hasher::{BuildNoHashHasher, IntMap};
use std::cmp::max;
use std::collections::VecDeque;
use text_generation_router::infer::InferError;
use text_generation_router::infer::InferStreamResponse;
use text_generation_router::usage_stats::Env;
use text_generation_router::validation::{
    Chunk, ChunksToString, ValidGenerateRequest, ValidGrammar, ValidParameters,
    ValidStoppingParameters,
};
use tokio::sync::{mpsc, oneshot};
use tokio::time::Instant;
use tracing::{info_span, instrument, Instrument, Span};

/// Queue entry
#[derive(Debug)]
pub(crate) struct Entry {
    /// Request
    pub request: ValidGenerateRequest,
    /// Response sender to communicate between the Infer struct and the batching_task
    pub response_tx: mpsc::UnboundedSender<Result<InferStreamResponse, InferError>>,
    /// Span that will live as long as entry
    pub span: Span,
    /// Temporary span used as a guard when logging inference, wait times...
    pub temp_span: Option<Span>,
    /// Instant when this entry was queued
    pub queue_time: Instant,
    /// Instant when this entry was added to a batch
    pub batch_time: Option<Instant>,
    /// Block Allocation
    pub block_allocation: Option<BlockAllocation>,
}

/// Request Queue
#[derive(Debug, Clone)]
pub(crate) struct Queue {
    /// Channel to communicate with the background queue task
    queue_sender: mpsc::UnboundedSender<QueueCommand>,
}

impl Queue {
    pub(crate) fn new(
        requires_padding: bool,
        block_size: u32,
        prefix_caching: bool,
        window_size: Option<u32>,
        speculate: u32,
        max_batch_total_tokens: u32,
        support_chunking: bool,
    ) -> Self {
        // Create channel
        let (queue_sender, queue_receiver) = mpsc::unbounded_channel();

        // Launch background queue task
        tokio::spawn(queue_task(
            requires_padding,
            block_size,
            prefix_caching,
            window_size,
            speculate,
            max_batch_total_tokens,
            support_chunking,
            queue_receiver,
        ));

        Self { queue_sender }
    }

    /// Append an entry to the queue
    #[instrument(skip_all)]
    pub(crate) fn append(&self, entry: Entry) {
        // Send append command to the background task managing the state
        // Unwrap is safe here
        self.queue_sender
            .send(QueueCommand::Append(Box::new(entry), Span::current()))
            .unwrap();
    }

    // Get the next batch
    #[instrument(skip(self))]
    pub(crate) async fn next_batch(
        &self,
        min_size: Option<usize>,
        max_size: Option<usize>,
        prefill_token_budget: u32,
        token_budget: u32,
    ) -> Option<NextBatch> {
        if prefill_token_budget == 0 || token_budget == 0 {
            return None;
        };

        // Create response channel
        let (response_sender, response_receiver) = oneshot::channel();
        // Send next batch command to the background task managing the state
        // Unwrap is safe here
        self.queue_sender
            .send(QueueCommand::NextBatch {
                min_size,
                max_size,
                prefill_token_budget,
                token_budget,
                response_sender,
                span: Span::current(),
            })
            .unwrap();
        // Await on response channel
        // Unwrap is safe here
        response_receiver.await.unwrap()
    }
}

// Background task responsible of the queue state
#[allow(clippy::too_many_arguments)]
async fn queue_task(
    requires_padding: bool,
    block_size: u32,
    prefix_caching: bool,
    window_size: Option<u32>,
    speculate: u32,
    max_batch_total_tokens: u32,
    support_chunking: bool,
    mut receiver: mpsc::UnboundedReceiver<QueueCommand>,
) {
    let mut state = State::new(
        requires_padding,
        block_size,
        prefix_caching,
        window_size,
        speculate,
        max_batch_total_tokens,
        support_chunking,
    );

    while let Some(cmd) = receiver.recv().await {
        match cmd {
            QueueCommand::Append(entry, span) => {
                span.in_scope(|| state.append(*entry));
                metrics::gauge!("tgi_queue_size").increment(1.0);
            }
            QueueCommand::NextBatch {
                min_size,
                max_size,
                prefill_token_budget,
                token_budget,
                response_sender,
                span,
            } => {
                let next_batch = state
                    .next_batch(min_size, max_size, prefill_token_budget, token_budget)
                    .instrument(span)
                    .await;
                response_sender.send(next_batch).unwrap();
                metrics::gauge!("tgi_queue_size").set(state.entries.len() as f64);
            }
        }
    }
}

/// Queue State
#[derive(Debug)]
struct State {
    /// Queue entries organized in a Vec
    entries: VecDeque<(u64, Entry)>,

    /// Id of the next entry
    next_id: u64,

    /// Id of the next batch
    next_batch_id: u64,

    /// Paged Attention block size
    block_size: u32,

    /// Speculation amount
    speculate: u32,

    /// Whether the model allow the prefill chunking
    /// If it does, the last request in the batch will be split to exactly match the prefill
    /// token budget
    support_chunking: bool,

    /// Paged Attention Block Allocation
    block_allocator: Option<BlockAllocator>,

    /// indicate if it's hpu device, the hpu device needs padding to generate first token.
    is_hpu_device: bool,
}

impl State {
    fn new(
        requires_padding: bool,
        block_size: u32,
        prefix_caching: bool,
        window_size: Option<u32>,
        speculate: u32,
        max_batch_total_tokens: u32,
        support_chunking: bool,
    ) -> Self {
        let block_allocator = (!requires_padding).then(|| {
            BlockAllocator::new(
                max_batch_total_tokens,
                block_size,
                prefix_caching,
                window_size,
            )
        });

        Self {
            entries: VecDeque::with_capacity(128),
            next_id: 0,
            next_batch_id: 0,
            block_size,
            speculate,
            support_chunking,
            block_allocator,
            is_hpu_device: Env::new().is_hpu_device(),
        }
    }

    /// Append an entry to the queue
    fn append(&mut self, mut entry: Entry) {
        // Create a span that will live as long as the entry is in the queue waiting to be batched
        let queue_span = info_span!(parent: &entry.span, "queued");
        entry.temp_span = Some(queue_span);

        // Push entry in the queue
        self.entries.push_back((self.next_id, entry));
        self.next_id += 1;
    }

    // Get the next batch
    async fn next_batch(
        &mut self,
        min_size: Option<usize>,
        max_size: Option<usize>,
        prefill_token_budget: u32,
        token_budget: u32,
    ) -> Option<NextBatch> {
        if self.entries.is_empty() {
            tracing::debug!("No queue");
            return None;
        }

        // Check if we have enough entries
        if let Some(min_size) = min_size {
            if self.entries.len() < min_size {
                tracing::debug!("Not enough entries");
                return None;
            }
        }

        if let Some(max_size) = max_size {
            if max_size == 0 {
                tracing::debug!("No capacity");
                return None;
            }
        }

        // Pad prefill_token_budget to be a multiple of block size
        let prefill_token_budget = prefill_token_budget.div_ceil(self.block_size) * self.block_size;

        // Create span for this batch to add context to inference calls
        let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty);
        next_batch_span.follows_from(Span::current());

        let mut batch = Vec::with_capacity(self.entries.len());
        let mut max_input_length = 0;
        let mut prefill_tokens: u32 = 0;
        let mut decode_tokens: u32 = 0;
        let mut max_blocks = 0;

        // Pop entries starting from the front of the queue
        'entry_loop: while let Some((id, entry)) = self.entries.pop_front() {
            // Filter entries where the response receiver was dropped (== entries where the request
            // was dropped by the client)
            if entry.response_tx.is_closed() {
                metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
                tracing::debug!("Dropping entry");
                continue;
            }

            let block_allocation = match &self.block_allocator {
                None => {
                    // We pad to max input length in the Python shards
                    // We need to take these padding tokens into the equation
                    max_input_length = max_input_length.max(entry.request.input_length);
                    prefill_tokens = (batch.len() + 1) as u32 * max_input_length;

                    decode_tokens += entry.request.stopping_parameters.max_new_tokens;
                    let total_tokens = prefill_tokens + decode_tokens + self.speculate;

                    if prefill_tokens > prefill_token_budget || total_tokens > token_budget {
                        // Entry is over budget
                        // Add it back to the front
                        tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
                        self.entries.push_front((id, entry));
                        break 'entry_loop;
                    }
                    None
                }
                Some(block_allocator) => {
                    // If users wants the prefill logprobs, we cannot reuse the cache.
                    // So no input_ids for the radix tree.
                    let input_ids = if entry.request.decoder_input_details {
                        None
                    } else {
                        entry.request.input_ids.clone()
                    };

                    let tokens = entry.request.input_length
                        + entry.request.stopping_parameters.max_new_tokens
                        + self.speculate
                        - 1;
                    // tracing::debug!("Allocating {tokens} with {input_ids:?}");

                    let block_allocation = match block_allocator.allocate(tokens, input_ids).await {
                        None => {
                            // Entry is over budget
                            // Add it back to the front
                            tracing::debug!("Over budget: not enough free blocks");
                            self.entries.push_front((id, entry));
                            break 'entry_loop;
                        }
                        Some(mut block_allocation) => {
                            // tracing::debug!("Allocation: {block_allocation:?}");
                            max_blocks = max(max_blocks, block_allocation.blocks.len() as u32);

                            if block_allocation.prefix_len == entry.request.input_length {
                                // The whole request was found in the radix trie
                                // However, for the transformer forward to work, we need to
                                // have at least one token of postfix.
                                block_allocation.prefix_len -= 1;
                            }

                            block_allocation
                        }
                    };

                    let postfix_len = entry.request.input_length - block_allocation.prefix_len;

                    if prefill_tokens + postfix_len > prefill_token_budget {
                        // Entry is over budget
                        if self.support_chunking {
                            // We support chunking, just set postfix_len to exactly match prefill_token_budget
                            let chunk_len = prefill_token_budget.saturating_sub(prefill_tokens);
                            if chunk_len > 0 {
                                // Push this entry inside the batch
                                batch.push((id, entry, Some(block_allocation), Some(chunk_len)));
                            } else {
                                // We cannot prefill even one token for this entry
                                // Add it back to the queue
                                self.entries.push_front((id, entry));
                            }
                            tracing::debug!(
                                "Matched budget: prefill_tokens={} == {prefill_token_budget}",
                                prefill_tokens + postfix_len
                            );
                            break 'entry_loop;
                        } else {
                            // We don't support chunking, this entry needs to go back to the buffer
                            // Add it back to the front
                            tracing::debug!(
                                "Over budget: prefill_tokens={} > {prefill_token_budget}",
                                prefill_tokens + postfix_len
                            );
                            self.entries.push_front((id, entry));
                            break 'entry_loop;
                        }
                    }

                    if self.is_hpu_device {
                        //HPU needs to pad for the prefill
                        max_input_length = max_input_length.max(entry.request.input_length);
                        let actual_prefill_tokens_for_hpu =
                            (batch.len() + 1) as u32 * max_input_length;

                        if actual_prefill_tokens_for_hpu > prefill_token_budget {
                            // Entry is over budget
                            // Add it back to the front
                            tracing::debug!("Over budget: prefill_tokens={actual_prefill_tokens_for_hpu} > {prefill_token_budget}");
                            self.entries.push_front((id, entry));
                            break 'entry_loop;
                        }
                    }

                    prefill_tokens += postfix_len;

                    Some(block_allocation)
                }
            };
            batch.push((id, entry, block_allocation, None));
            if Some(batch.len()) == max_size {
                break;
            }
        }

        // Empty batch
        if batch.is_empty() {
            tracing::debug!("Filterered out all entries");
            return None;
        }

        // XXX We haven't allocated yet, so we're allowed to ditch the results.
        // Check if our batch is big enough
        if let Some(min_size) = min_size {
            // Batch is too small
            if batch.len() < min_size {
                // Add back entries to the queue in the correct order
                for (id, entry, _, _) in batch.into_iter().rev() {
                    self.entries.push_front((id, entry));
                }
                return None;
            }
        }

        let mut batch_requests = Vec::with_capacity(self.entries.len());
        let mut batch_entries =
            IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default());

        for (id, mut entry, block_allocation, chunk_len) in batch {
            // Create a new span to link the batch back to this entry
            let entry_batch_span = info_span!(parent: &entry.span, "infer");
            // Add relationships
            next_batch_span.follows_from(&entry_batch_span);
            entry_batch_span.follows_from(&next_batch_span);
            // Update entry
            entry.temp_span = Some(entry_batch_span);

            let (blocks, slots, prefix_len) = match &block_allocation {
                None => (Vec::new(), Vec::new(), 0),
                Some(block_allocation) => (
                    block_allocation.blocks.clone(),
                    block_allocation.slots.clone(),
                    block_allocation.prefix_len,
                ),
            };

            entry.block_allocation = block_allocation;

            batch_requests.push(Request {
                id,
                prefill_logprobs: entry.request.decoder_input_details,
                input_chunks: Some(client::Input {
                    chunks: entry
                        .request
                        .inputs
                        .clone()
                        .into_iter()
                        .map(|c| client::InputChunk {
                            chunk: Some(match c {
                                Chunk::Text(text) => client::Chunk::Text(text),
                                Chunk::Image(image) => client::Chunk::Image(client::Image {
                                    data: image.data,
                                    mimetype: image.mimetype,
                                }),
                            }),
                        })
                        .collect(),
                }),
                inputs: entry.request.inputs.chunks_to_string(),
                truncate: entry.request.truncate,
                add_special_tokens: entry.request.add_special_tokens,
                parameters: Some(NextTokenChooserParameters::from(
                    entry.request.parameters.clone(),
                )),
                stopping_parameters: Some(StoppingCriteriaParameters::from(
                    entry.request.stopping_parameters.clone(),
                )),
                top_n_tokens: entry.request.top_n_tokens,
                blocks,
                slots,
                cache_len: prefix_len,
                adapter_id: entry.request.adapter_id.clone(),
                chunk_len,
            });
            // Set batch_time
            entry.batch_time = Some(Instant::now());
            // Insert in batch_entries IntMap
            batch_entries.insert(id, entry);
        }

        // Final batch size
        let size = batch_requests.len() as u32;
        next_batch_span.record("batch_size", size);

        let batch = Batch {
            id: self.next_batch_id,
            requests: batch_requests,
            size,
            max_tokens: (prefill_tokens + decode_tokens),
            max_blocks,
        };
        // Increment batch id
        self.next_batch_id += 1;

        metrics::histogram!("tgi_batch_next_size").record(batch.size as f64);

        Some((batch_entries, batch, next_batch_span))
    }
}

type NextBatch = (IntMap<u64, Entry>, Batch, Span);

#[derive(Debug)]
enum QueueCommand {
    Append(Box<Entry>, Span),
    NextBatch {
        min_size: Option<usize>,
        max_size: Option<usize>,
        prefill_token_budget: u32,
        token_budget: u32,
        response_sender: oneshot::Sender<Option<NextBatch>>,
        span: Span,
    },
}

impl From<ValidParameters> for NextTokenChooserParameters {
    fn from(value: ValidParameters) -> Self {
        let (grammar, grammar_type) = match value.grammar {
            None => (String::new(), GrammarType::None),

            Some(grammar) => match grammar {
                ValidGrammar::Json(grammar_string) => (grammar_string, GrammarType::Json),
                ValidGrammar::Regex(grammar_string) => (grammar_string, GrammarType::Regex),
            },
        };

        Self {
            temperature: value.temperature,
            top_k: value.top_k,
            top_p: value.top_p,
            typical_p: value.typical_p,
            do_sample: value.do_sample,
            seed: value.seed,
            repetition_penalty: value.repetition_penalty,
            frequency_penalty: value.frequency_penalty,
            watermark: value.watermark,
            grammar,
            grammar_type: grammar_type.into(),
        }
    }
}

impl From<ValidStoppingParameters> for StoppingCriteriaParameters {
    fn from(value: ValidStoppingParameters) -> Self {
        Self {
            max_new_tokens: value.max_new_tokens,
            stop_sequences: value.stop_sequences,
            ignore_eos_token: value.ignore_eos_token,
        }
    }
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::*;
    use tracing::info_span;

    fn default_entry() -> (
        Entry,
        mpsc::UnboundedReceiver<Result<InferStreamResponse, InferError>>,
    ) {
        let (response_tx, receiver_tx) = mpsc::unbounded_channel();

        let entry = Entry {
            request: ValidGenerateRequest {
                inputs: vec![],
                input_ids: Some(Arc::new(vec![])),
                input_length: 1,
                add_special_tokens: true,
                truncate: 0,
                decoder_input_details: false,
                parameters: ValidParameters {
                    temperature: 0.0,
                    top_k: 0,
                    top_p: 0.0,
                    typical_p: 0.0,
                    do_sample: false,
                    seed: 0,
                    repetition_penalty: 0.0,
                    frequency_penalty: 0.0,
                    watermark: false,
                    grammar: None,
                },
                stopping_parameters: ValidStoppingParameters {
                    ignore_eos_token: false,
                    max_new_tokens: 1,
                    max_total_new_tokens: 1024,
                    stop_sequences: vec![],
                },
                top_n_tokens: 0,
                adapter_id: None,
            },
            response_tx,
            span: info_span!("entry"),
            temp_span: None,
            queue_time: Instant::now(),
            batch_time: None,
            block_allocation: None,
        };
        (entry, receiver_tx)
    }

    #[tokio::test]
    async fn test_append() {
        let mut state = State::new(false, 1, false, None, 0, 16, false);
        let (entry, _guard) = default_entry();

        assert_eq!(state.next_id, 0);
        assert_eq!(state.entries.len(), 0);

        state.append(entry);

        assert_eq!(state.next_id, 1);
        assert_eq!(state.entries.len(), 1);
        let (id, _) = state.entries.remove(0).unwrap();
        assert_eq!(id, 0);
    }

    #[tokio::test]
    async fn test_next_batch_empty() {
        let mut state = State::new(false, 1, false, None, 0, 16, false);

        assert!(state.next_batch(None, None, 1, 1).await.is_none());
        assert!(state.next_batch(Some(1), None, 1, 1).await.is_none());
    }

    #[tokio::test]
    async fn test_next_batch_min_size() {
        let mut state = State::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        state.append(entry1);
        state.append(entry2);

        let (entries, batch, _) = state.next_batch(None, None, 2, 2).await.unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&0));
        assert!(entries.contains_key(&1));
        assert!(entries.get(&0).unwrap().batch_time.is_some());
        assert!(entries.get(&1).unwrap().batch_time.is_some());
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);

        assert_eq!(state.next_id, 2);
        assert_eq!(state.entries.len(), 0);
        assert_eq!(state.next_batch_id, 1);

        let (entry3, _guard3) = default_entry();
        state.append(entry3);

        assert!(state.next_batch(Some(2), None, 2, 2).await.is_none());

        assert_eq!(state.next_id, 3);
        assert_eq!(state.entries.len(), 1);
        let (id, _) = state.entries.remove(0).unwrap();
        assert_eq!(id, 2);
    }

    #[tokio::test]
    async fn test_next_batch_max_size() {
        let mut state = State::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        state.append(entry1);
        state.append(entry2);

        let (entries, batch, _) = state.next_batch(None, Some(1), 2, 2).await.unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert!(entries.get(&0).unwrap().batch_time.is_some());
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        assert_eq!(state.next_id, 2);
        assert_eq!(state.entries.len(), 1);
        assert_eq!(state.next_batch_id, 1);
    }

    #[tokio::test]
    async fn test_next_batch_token_budget() {
        let mut state = State::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        state.append(entry1);
        state.append(entry2);

        let (entries, batch, _) = state.next_batch(None, None, 1, 1).await.unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        assert_eq!(state.next_id, 2);
        assert_eq!(state.entries.len(), 1);
        assert_eq!(state.next_batch_id, 1);

        let (entry3, _guard3) = default_entry();
        state.append(entry3);

        let (entries, batch, _) = state.next_batch(None, None, 3, 3).await.unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&1));
        assert!(entries.contains_key(&2));
        assert_eq!(batch.id, 1);
        assert_eq!(batch.size, 2);

        assert_eq!(state.next_id, 3);
        assert_eq!(state.entries.len(), 0);
        assert_eq!(state.next_batch_id, 2);
    }

    #[tokio::test]
    async fn test_queue_append() {
        let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry, _guard) = default_entry();
        queue.append(entry);
    }

    #[tokio::test]
    async fn test_queue_next_batch_empty() {
        let queue = Queue::new(false, 1, false, None, 0, 16, false);

        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
        assert!(queue.next_batch(Some(1), None, 1, 1).await.is_none());
    }

    #[tokio::test]
    async fn test_queue_next_batch_min_size() {
        let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
        queue.append(entry2);

        let (entries, batch, _) = queue.next_batch(None, None, 2, 2).await.unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&0));
        assert!(entries.contains_key(&1));
        assert!(entries.get(&0).unwrap().batch_time.is_some());
        assert!(entries.get(&1).unwrap().batch_time.is_some());
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);

        let (entry3, _guard3) = default_entry();
        queue.append(entry3);

        // Not enough requests pending
        assert!(queue.next_batch(Some(2), None, 2, 2).await.is_none());
        // Not enough token budget
        assert!(queue.next_batch(Some(1), None, 0, 0).await.is_none());
        // Ok
        let (entries2, batch2, _) = queue.next_batch(Some(1), None, 2, 2).await.unwrap();
        assert_eq!(entries2.len(), 1);
        assert!(entries2.contains_key(&2));
        assert!(entries2.get(&2).unwrap().batch_time.is_some());
        assert_eq!(batch2.id, 1);
        assert_eq!(batch2.size, 1);
    }

    #[tokio::test]
    async fn test_queue_next_batch_max_size() {
        let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
        queue.append(entry2);

        let (entries, batch, _) = queue.next_batch(None, Some(1), 2, 2).await.unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert!(entries.get(&0).unwrap().batch_time.is_some());
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);
    }

    #[tokio::test]
    async fn test_queue_next_batch_token_budget() {
        let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
        queue.append(entry2);

        let (entries, batch, _) = queue.next_batch(None, None, 1, 1).await.unwrap();
        assert_eq!(entries.len(), 1);
        assert!(entries.contains_key(&0));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 1);

        let (entry3, _guard3) = default_entry();
        queue.append(entry3);

        let (entries, batch, _) = queue.next_batch(None, None, 3, 3).await.unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&1));
        assert!(entries.contains_key(&2));
        assert_eq!(batch.id, 1);
        assert_eq!(batch.size, 2);
    }

    #[tokio::test]
    async fn test_queue_next_batch_token_speculate() {
        let queue = Queue::new(true, 1, false, None, 2, 16, false);
        let (entry1, _guard1) = default_entry();
        let (entry2, _guard2) = default_entry();
        queue.append(entry1);
        queue.append(entry2);

        // Budget of 1 is not enough
        assert!(queue.next_batch(None, None, 1, 1).await.is_none());

        let (entries, batch, _) = queue.next_batch(None, None, 6, 6).await.unwrap();
        assert_eq!(entries.len(), 2);
        assert!(entries.contains_key(&0));
        assert!(entries.contains_key(&1));
        assert_eq!(batch.id, 0);
        assert_eq!(batch.size, 2);
    }

    #[tokio::test]
    async fn test_queue_next_batch_dropped_receiver() {
        let queue = Queue::new(false, 1, false, None, 0, 16, false);
        let (entry, _) = default_entry();
        queue.append(entry);

        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
    }
}


================================================
FILE: backends/v3/src/radix.rs
================================================
use crate::block_allocator::{Allocator, BlockAllocation};
use slotmap::{DefaultKey, SlotMap};
use std::hash::{Hash, Hasher};
use std::{
    collections::{BTreeSet, HashMap},
    sync::Arc,
};

fn hash(slice: &[u32]) -> u64 {
    assert!(!slice.is_empty());
    if slice.len() == 1 {
        slice[0] as u64
    } else {
        let mut s = std::hash::DefaultHasher::new();
        slice.hash(&mut s);
        s.finish()
    }
}

pub struct RadixAllocator {
    allocation_id: u64,

    allocations: HashMap<u64, RadixAllocation>,

    cache_blocks: RadixTrie,

    /// Blocks that are immediately available for allocation.
    free_blocks: Vec<u32>,

    #[allow(dead_code)]
    // This isn't used because the prefix need to match without the windowing
    // mecanism. This at worst is overallocating, not necessarily being wrong.
    window_size: Option<u32>,

    block_size: u32,
}

impl RadixAllocator {
    pub fn new(block_size: u32, n_blocks: u32, window_size: Option<u32>) -> Self {
        RadixAllocator {
            allocation_id: 0,
            allocations: HashMap::new(),
            cache_blocks: RadixTrie::new(block_size as usize),

            // Block 0 is reserved for health checks.
            free_blocks: (1..n_blocks).collect(),
            window_size,
            block_size,
        }
    }

    fn alloc_or_reclaim(&mut self, n_blocks_needed: usize) -> Option<Vec<u32>> {
        if self.free_blocks.len() < n_blocks_needed {
            // This is a bit annoying, we first extend the free list and then
            // split it off again below. This is because we need to put it on
            // the free list if we cannot allocate enough blocks. This is only
            // temporary, the trie needs to be able to report whether it can
            // allocate the requested amount. Just not implemented yet.
            tracing::debug!(
                "Free blocks {}  need {n_blocks_needed}",
                self.free_blocks.len()
            );
            self.free_blocks.extend(
                self.cache_blocks
                    .evict(n_blocks_needed - self.free_blocks.len()),
            );
        }

        if self.free_blocks.len() >= n_blocks_needed {
            Some(
                self.free_blocks
                    .split_off(self.free_blocks.len() - n_blocks_needed),
            )
        } else {
            None
        }
    }
}

// Allocator trait
impl Allocator for RadixAllocator {
    fn allocate(
        &mut self,
        tokens: u32,
        prefill_tokens: Option<Arc<Vec<u32>>>,
    ) -> Option<BlockAllocation> {
        let mut blocks = vec![];
        let prefix_node = if let Some(prefill_tokens) = prefill_tokens.as_ref() {
            let node_id = self
                .cache_blocks
                .find(prefill_tokens.as_slice(), &mut blocks);
            node_id
        } else {
            self.cache_blocks.root_id()
        };

        // Even if this allocation fails below, we need to increase he
        // refcount to ensure that the prefix that was found is not evicted.
        self.cache_blocks
            .incref(prefix_node)
            .expect("Failed to increment refcount");

        let prefix_len = blocks.len() * self.block_size as usize;
        let suffix_len = tokens - prefix_len as u32;

        let suffix_blocks = suffix_len.div_ceil(self.block_size);

        tracing::info!("Prefix {prefix_len} - Suffix {suffix_len}");

        match self.alloc_or_reclaim(suffix_blocks as usize) {
            Some(suffix_blocks) => blocks.extend(suffix_blocks),
            None => {
                tracing::debug!("Cannot allocate {:?}", self.cache_blocks);
                tracing::debug!("Found {prefix_len} prefix tokens need {suffix_blocks} suffix blocks for {tokens} tokens");
                tracing::debug!("Block size {}", self.block_size);
                self.cache_blocks
                    .decref(prefix_node)
                    .expect("Failed to decrement refcount");
                return None;
            }
        }

        // 1:1 mapping of blocks and slots.
        let slots = if self.block_size == 1 {
            blocks.clone()
        } else {
            let mut slots = Vec::with_capacity(blocks.len() * self.block_size as usize);
            'slots: for block_id in &blocks {
                for s in (block_id * self.block_size)..((block_id + 1) * self.block_size) {
                    slots.push(s);
                    if slots.len() as u32 == tokens {
                        break 'slots;
                    }
                }
            }
            slots
        };

        let allocation = RadixAllocation {
            prefix_node,
            cached_prefix_len: prefix_len,
            prefill_tokens: prefill_tokens.clone(),
        };

        self.allocation_id += 1;
        self.allocations.insert(self.allocation_id, allocation);

        Some(BlockAllocation {
            allocation_id: self.allocation_id,
            block_allocator: None,
            blocks,
            slots,
            prefix_len: prefix_len as u32,
        })
    }

    fn free(&mut self, blocks: Vec<u32>, allocation_id: u64) {
        let allocation = match self.allocations.remove(&allocation_id) {
            Some(allocation) => allocation,
            None => unreachable!("Tried to free an unknown allocation."),
        };

        self.cache_blocks
            .decref(allocation.prefix_node)
            .expect("Failed to decrement refcount");

        if let Some(prefill_tokens) = allocation.prefill_tokens {
            let prefill_tokens = prefill_tokens.as_slice();

            // If there are prefill tokens that did not come from the cache,
            // add them to the cache.
            if prefill_tokens.len() > allocation.cached_prefix_len {
                let aligned =
                    (prefill_tokens.len() / self.block_size as usize) * self.block_size as usize;
                if aligned > 0 {
                    let prefix_len = self
                        .cache_blocks
                        .insert(
                            &prefill_tokens[..aligned],
                            &blocks[..aligned / self.block_size as usize],
                        )
                        // Unwrap, failing is a programming error.
                        .expect("Failed to store prefill tokens");
                    // We can have a prefill with the following structure:
                    //
                    // |---| From the prefix cache.
                    // A B C D E F G
                    //|--------| Found in the trie during insertion.
                    //
                    // This means that while processing this request there was a
                    // partially overlapping request that had A..=E in its
                    // prefill. In this case we need to free the blocks D E.
                    if prefix_len > allocation.cached_prefix_len {
                        self.free_blocks.extend(
                            &blocks[allocation.cached_prefix_len / self.block_size as usize
                                ..prefix_len / self.block_size as usize],
                        );
                    }
                }
            }

            // Free non-prefill blocks.
            self.free_blocks
                .extend(&blocks[prefill_tokens.len() / self.block_size as usize..]);
        } else {
            self.free_blocks.extend(blocks);
        }
    }
}

struct RadixAllocation {
    prefix_node: NodeId,
    cached_prefix_len: usize,
    prefill_tokens: Option<Arc<Vec<u32>>>,
}

// Radix trie that is heavily inspired by radix attention from sglang.
//
// The trie is optimized for prefix caching:
//
// - A normal radix trie stores discrete values. In this radix trie,
//   inserting *abc* with value *xyz* will also enable lookup for
//   *a* (*x*) and *ab* (*xy*).
// - As a result, every value is required to have the same length as
//   the key.
// - We store additional information in each node, such as last access
//   time and a reference count.

#[derive(Debug)]
pub enum TrieError {
    InvalidNodeId,
    RefCountUnderflow,
}

pub type NodeId = DefaultKey;

#[derive(Debug)]
pub struct RadixTrie {
    /// Identifier of the root nod.
    root: DefaultKey,

    /// Leave node identifiers ordered by increasing recency.
    leaves: BTreeSet<(u64, NodeId)>,

    /// All trie nodes.
    nodes: SlotMap<NodeId, TrieNode>,

    /// Time as a monotonically increating counter to avoid the system
    /// call that a real time lookup would require.
    time: u64,

    /// All blocks need to be aligned with this
    block_size: usize,
}

impl RadixTrie {
    /// Construct a new radix trie.
    pub fn new(block_size: usize) -> Self {
        let root = TrieNode::new(vec![], vec![], 0, None);
        let mut nodes = SlotMap::new();
        let root = nodes.insert(root);
        RadixTrie {
            leaves: BTreeSet::new(),
            nodes,
            root,
            time: 0,
            block_size,
        }
    }

    /// Find the prefix of the given tokens.
    ///
    /// The blocks corresponding to the part of the prefix that could be found
    /// are written to `blocks`. The number of blocks is in `0..=tokens.len()`.
    /// Returns the identifier of the trie node that contains the longest
    /// prefix. The node identifier can be used by callers to e.g. increase its
    /// reference count.
    ///
    /// Using this method will update the access time of the traversed nodes.
    pub fn find(&mut self, key: &[u32], blocks: &mut Vec<u32>) -> NodeId {
        self.time += 1;
        self.find_(self.root, key, blocks)
    }

    /// Find worker.
    fn find_(&mut self, node_id: NodeId, key: &[u32], blocks: &mut Vec<u32>) -> NodeId {
        let node = &self.nodes[node_id];

        if key.len() >= self.block_size {
            let node_key = hash(&key[..self.block_size]);
            if let Some(&child_id) = node.children.get(&node_key) {
                self.update_access_time(child_id);
                let child = self.nodes.get(child_id).expect("Invalid child identifier");
                let shared_prefix_len = shared_prefix(&child.key, key, self.block_size);
                assert_eq!(shared_prefix_len % self.block_size, 0);
                blocks.extend(&child.blocks[..shared_prefix_len / self.block_size]);

                // A node represents the prefix of its children. So, only
                // recurse when there is a full prefix match.
                let key = &key[shared_prefix_len..];
                if !key.is_empty() && shared_prefix_len == child.key.len() {
                    return self.find_(child_id, key, blocks);
                } else {
                    return child_id;
                }
            }
        }

        node_id
    }

    /// Decrease the reference count of a node.
    pub fn decref(&mut self, node_id: NodeId) -> Result<(), TrieError> {
        // We don't care about refcounting for root, since it will never
        // be evicted.
        if node_id == self.root {
            return Ok(());
        }

        let node = self
            .nodes
            .get_mut(node_id)
            .ok_or(TrieError::InvalidNodeId)?;
        if node.ref_count == 0 {
            return Err(TrieError::RefCountUnderflow);
        }

        node.ref_count -= 1;
        if node.ref_count == 0 {
            assert!(
                node.children.is_empty(),
                "Nodes with children must have refcount > 0"
            );

            self.leaves.insert((node.last_accessed, node_id));
        }

        Ok(())
    }

    /// Increase the reference count of a node.
    pub fn incref(&mut self, node_id: NodeId) -> Result<(), TrieError> {
        if node_id == self.root {
            return Ok(());
        }

        let node = self
            .nodes
            .get_mut(node_id)
            .ok_or(TrieError::InvalidNodeId)?;
        if node.ref_count == 0 {
            self.leaves.remove(&(node.last_accessed, node_id));
        }
        node.ref_count += 1;

        Ok(())
    }

    /// Evict `n_blocks` from the trie.
    ///
    /// Returns the evicted blocks. When the length is less than `n_blocks`,
    /// not enough blocks could be evicted.
    pub fn evict(&mut self, n_blocks: usize) -> Vec<u32> {
        // NOTE: we don't return Result here. If any of the unwrapping fails,
        // it's a programming error in the trie implementation, not a user
        // error caused by e.g. an invalid argument.

        // TODO: add some bookkeeping in the future to check whether we can
        // evict n_blocks and return `None` if we can't. We are now needlessly
        // evicting prefixes from the cache in such a case.
        let mut evicted = Vec::new();
        tracing::debug!("Evicting in search of {n_blocks}");

        while let Some((last_access, node_id)) = self.leaves.pop_first() {
            let blocks_needed = n_blocks.saturating_sub(evicted.len());
            tracing::debug!("Evicting node {node_id:?} ");

            let node = self.nodes.get(node_id).expect("Leave does not exist");
            assert_eq!(
                node.ref_count, 0,
                "Leaf must have refcount of 0, got {}",
                node.ref_count
            );

            if blocks_needed >= node.blocks.len() {
                // We need to evict the whole node if we need more blocks than it has.
                let node = self.remove_node(node_id);
                evicted.extend(node.blocks);

                if evicted.len() >= n_blocks {
                    break;
                }
            } else {
                // The node has more blocks than needed, so we'll just remove
                // the required number of blocks and leave the remaining blocks
                // untouched.
                let node = self.nodes.get_mut(node_id).expect("Leave does not exist");

                let truncate_blocks = node.blocks.len() - blocks_needed;
                let truncate_tokens = truncate_blocks * self.block_size;
                node.key.truncate(truncate_tokens);
                evicted.extend(node.blocks.split_off(truncate_blocks));
                self.leaves.insert((last_access, node_id));
                break;
            }
        }

        evicted
    }

    /// Insert a prefill along with its blocks.
    ///
    /// This method returns the length of the prefix that was already
    /// in the trie. E.g. if the length is 10, this means that for
    /// the first 10 elements of the tree **the blocks are not updated**.
    pub fn insert(&mut self, tokens: &[u32], blocks: &[u32]) -> Result<usize, TrieError> {
        self.time += 1;
        let common = self.insert_(self.root, tokens, blocks)?;
        Ok(common)
    }

    /// Insertion worker.
    fn insert_(
        &mut self,
        node_id: NodeId,
        tokens: &[u32],
        blocks: &[u32],
    ) -> Result<usize, TrieError> {
        // TODO: in the future we may want to check that the blocks match for
        // the part of the prefix that is already in the trie to detect
        // mismatches.

        assert_eq!(tokens.len(), blocks.len() * self.block_size);

        let node_key = hash(&tokens[..self.block_size]);
        if let Some(&child_id) = self.nodes[node_id].children.get(&node_key) {
            self.update_access_time(child_id);
            let child = self
                .nodes
                .get_mut(child_id)
                // Unwrap here, since failure is a bug.
                .expect("Child node does not exist");
            let shared_prefix_len = shared_prefix(&child.key, tokens, self.block_size);

            // We are done, the prefix is already in the trie.
            if shared_prefix_len == tokens.len() || shared_prefix_len == 0 {
                return Ok(shared_prefix_len);
            }

            // The node's prefix is a prefix of the insertion prefix.
            if shared_prefix_len == child.key.len() {
                return Ok(shared_prefix_len
                    + self.insert_(
                        child_id,
                        &tokens[shared_prefix_len..],
                        &blocks[shared_prefix_len / self.block_size..],
                    )?);
            }

            // The node's prefix and the insertion prefix only match partially,
            // split the node to just contain the matching part. Then insert the
            // remainder of the prefix into the node again
            let child_id = self.split_node(child_id, shared_prefix_len);
            let key = &tokens[shared_prefix_len..];
            let blocks = &blocks[shared_prefix_len / self.block_size..];
            Ok(shared_prefix_len + self.insert_(child_id, key, blocks)?)
        } else {
            self.add_node(node_id, tokens, blocks);
            Ok(0)
        }
    }

    fn split_node(&mut self, node_id: NodeId, prefix_len: usize) -> NodeId {
        // We have to make the current node a child to ensure that its
        // properties and node id stay the same.

        // This funcion unwraps, an  invalid node_id is a programming error.

        let node = self
            .nodes
            .get_mut(node_id)
            .expect("Node to-be split does not exist");
        let mut parent_key = node.key.split_off(prefix_len);
        let prefix_blocks = prefix_len / self.block_size;
        let mut parent_blocks = node.blocks.split_off(prefix_blocks);

        // Move first part of the prefix to the parent. We swap to avoid
        // an allocation + copy for both splits of the key/blocks.
        std::mem::swap(&mut node.key, &mut parent_key);
        std::mem::swap(&mut node.blocks, &mut parent_blocks);

        let node_key = hash(&node.key[..self.block_size]);

        let grandparent_id = node.parent.expect("Node does not have a parent");
        let parent_id = self.add_node(grandparent_id, parent_key, parent_blocks);
        self.add_node_to_parent(parent_id, node_key, node_id);

        // Reborrow to make the borrow checker happy.
        let node = self
            .nodes
            .get_mut(node_id)
            .expect("Node to-be split does not exist");
        node.parent = Some(parent_id);

        parent_id
    }

    /// Create a node and add it to the parent.
    fn add_node(
        &mut self,
        parent_id: NodeId,
        key: impl Into<Vec<u32>>,
        blocks: impl Into<Vec<u32>>,
    ) -> NodeId {
        let key = key.into();
        let blocks = blocks.into();
        let first = hash(&key[..self.block_size]);

        let child = TrieNode::new(key, blocks, self.time, Some(parent_id));
        let child_id = self.nodes.insert(child);

        self.add_node_to_parent(parent_id, first, child_id);
        self.leaves.insert((self.time, child_id));

        child_id
    }

    /// Add a node to the parent.
    fn add_node_to_parent(&mut self, parent_id: NodeId, hash: u64, child_id: NodeId) {
        // Unwrap here, passing in an unknown id is a programming error.
        let parent = self.nodes.get_mut(parent_id).expect("Unknown parent node");
        if parent.children.insert(hash, child_id).is_none() {
            // Only increase reference count if child does not replace another child.
            self.incref(parent_id)
                .expect("Failed to increase parent refcount");
        }
    }

    /// Remove a node from the trie.
    fn remove_node(&mut self, node_id: NodeId) -> TrieNode {
        // Unwrap here, passing in an unknown id is a programming error.
        let node = self.nodes.remove(node_id).expect("Unknown node");
        assert!(
            node.children.is_empty(),
            "Tried to remove a node with {} children",
            node.children.len()
        );
        let parent_id = node.parent.expect("Attempted to remove root node");
        let parent = self.nodes.get_mut(parent_id).expect("Unknown parent node");

        let node_key = hash(&node.key[..self.block_size]);
        parent.children.remove(&node_key);
        self.decref(parent_id)
            .expect("Failed to decrease parent refcount");
        node
    }

    fn update_access_time(&mut self, node_id: NodeId) {
        // Unwrap here, passing in an unknown id is a programming error.
        let node = self.nodes.get_mut(node_id).expect("Unknown node");

        // Update the ordered leaves set if the node is a leave.
        if self.leaves.remove(&(node.last_accessed, node_id)) {
            self.leaves.insert((self.time, node_id));
        }

        node.last_accessed = self.time;
    }

    #[allow(dead_code)]
    #[doc(hidden)]
    /// Print debugging output for the trie.
    ///
    /// In contrast to `Debug` nicely formatted.
    pub fn print_debug(&self) {
        self.print_debug_(self.root, 0);
    }

    fn print_debug_(&self, node_id: NodeId, indent: usize) {
        let node = &self.nodes[node_id];
        eprintln!(
            "{}{:?}, key: {:?}, blocks: {:?}, ref_count: {}, last_accessed: {}, parent: {:?}, children: {:?}",
            " ".repeat(indent),
            node_id,
            node.key,
            node.blocks,
            node.ref_count,
            node.last_accessed,
            node.parent,
            node.children
        );
        for child_id in self.nodes[node_id].children.values() {
            self.print_debug_(*child_id, indent + 2);
        }
    }

    pub(crate) fn root_id(&self) -> DefaultKey {
        self.root
    }
}

/// Trie node.
#[derive(Debug)]
struct TrieNode {
    blocks: Vec<u32>,
    children: HashMap<u64, NodeId>,
    key: Vec<u32>,
    last_accessed: u64,
    parent: Option<NodeId>,
    ref_count: usize,
}

impl TrieNode {
    fn new(key: Vec<u32>, blocks: Vec<u32>, last_accessed: u64, parent: Option<NodeId>) -> Self {
        TrieNode {
            children: HashMap::new(),
            key,
            blocks,
            last_accessed,
            parent,
            ref_count: 0,
        }
    }
}

fn shared_prefix(left: &[u32], right: &[u32], block_size: usize) -> usize {
    let full = left.iter().zip(right).take_while(|(a, b)| a == b).count();
    // NOTE: this is the case because the child node was chosen based on
    //       matching the first character of the key/prefix.
    assert!(full > 0, "Prefixes must at least share 1 token");
    (full / block_size) * block_size
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use rand::{
        distributions::Uniform, prelude::Distribution, rngs::SmallRng, seq::SliceRandom,
        SeedableRng,
    };
    use rustc_hash::FxHashSet;

    use super::*;

    #[test]
    fn allocator_block_size() {
        let mut cache = RadixAllocator::new(2, 12, None);
        let allocation = cache.allocate(8, Some(Arc::new(vec![0, 1, 2, 3]))).unwrap();
        assert_eq!(allocation.blocks, vec![8, 9, 10, 11]);
        assert_eq!(allocation.slots, vec![16, 17, 18, 19, 20, 21, 22, 23]);
        assert_eq!(allocation.prefix_len, 0);
        cache.free(allocation.blocks.clone(), allocation.allocation_id);

        let allocation = cache.allocate(8, Some(Arc::new(vec![0, 1, 2, 3]))).unwrap();
        assert_eq!(allocation.blocks, vec![8, 9, 10, 11]);
        assert_eq!(allocation.slots, vec![16, 17, 18, 19, 20, 21, 22, 23]);
        assert_eq!(allocation.prefix_len, 4);
    }

    #[test]
    fn allocator_block_size_non_aligned() {
        let mut cache = RadixAllocator::new(2, 12, None);
        let allocation = cache.allocate(7, Some(Arc::new(vec![0, 1, 2]))).unwrap();
        assert_eq!(allocation.blocks, vec![8, 9, 10, 11]);
        assert_eq!(allocation.slots, vec![16, 17, 18, 19, 20, 21, 22]);
        assert_eq!(allocation.prefix_len, 0);
        cache.free(allocation.blocks.clone(), allocation.allocation_id);

        let allocation = cache.allocate(7, Some(Arc::new(vec![0, 1, 2]))).unwrap();
        assert_eq!(allocation.blocks, vec![8, 9, 10, 11]);
        assert_eq!(allocation.slots, vec![16, 17, 18, 19, 20, 21, 22]);
        assert_eq!(allocation.prefix_len, 2);
    }

    #[test]
    fn allocator_reuses_prefixes() {
        let mut cache = RadixAllocator::new(1, 12, None);
        let allocation = cache.allocate(8, Some(Arc::new(vec![0, 1, 2, 3]))).unwrap();
        assert_eq!(allocation.blocks, vec![4, 5, 6, 7, 8, 9, 10, 11]);
        assert_eq!(allocation.blocks, allocation.slots);
        assert_eq!(allocation.prefix_len, 0);
        cache.free(allocation.blocks.clone(), allocation.allocation_id);

        let allocation = cache.allocate(8, Some(Arc::new(vec![0, 1, 2, 3]))).unwrap();
        assert_eq!(allocation.blocks, vec![4, 5, 6, 7, 8, 9, 10, 11]);
        assert_eq!(allocation.prefix_len, 4);
    }

    #[test]
    fn allocator_collects_older_prefixes_first() {
        let mut cache = RadixAllocator::new(1, 7, None);
        let allocation1 = cache.allocate(4, Some(Arc::new(vec![0, 1, 2, 3]))).unwrap();
        assert_eq!(allocation1.blocks, vec![3, 4, 5, 6]);
        assert_eq!(allocation1.prefix_len, 0);

        let allocation2 = cache.allocate(2, Some(Arc::new(vec![4, 5]))).unwrap();
        assert_eq!(allocation2.blocks, vec![1, 2]);
        assert_eq!(allocation2.prefix_len, 0);

        cache.free(allocation1.blocks.clone(), allocation1.allocation_id);
        cache.free(allocation2.blocks.clone(), allocation2.allocation_id);

        // We should get the blocks of the first allocation, since they are more recent.
        let allocation3 = cache.allocate(4, Some(Arc::new(vec![6, 7, 8, 9]))).unwrap();
        assert_eq!(allocation3.blocks, vec![3, 4, 5, 6]);
        assert_eq!(allocation3.prefix_len, 0);
    }

    #[test]
    fn allocator_frees_fully_overlapping_prefills() {
        let mut cache = RadixAllocator::new(1, 10, None);
        let allocation1 = cache.allocate(4, Some(Arc::new(vec![0, 1, 2, 3]))).unwrap();
        let allocation2 = cache.allocate(4, Some(Arc::new(vec![0, 1, 2, 3]))).unwrap();

        cache.free(allocation2.blocks.clone(), allocation2.allocation_id);
        cache.free(allocation1.blocks.clone(), allocation1.allocation_id);

        let allocation3 = cache.allocate(4, Some(Arc::new(vec![0, 1, 2, 3]))).unwrap();
        assert_eq!(allocation3.prefix_len, 4);

        // 10 blocks, of which 1 reserved for health checks, 4 for the cached blocks.
        assert_eq!(cache.free_blocks.len(), 5);
    }

    #[test]
    fn allocator_frees_partially_overlapping_prefills() {
        let mut cache = RadixAllocator::new(1, 20, None);
        let allocation1 = cache.allocate(4, Some(Arc::new(vec![0, 1]))).unwrap();
        assert_eq!(allocation1.blocks, vec![16, 17, 18, 19]);
        assert_eq!(allocation1.prefix_len, 0);

        cache.free(allocation1.blocks.clone(), allocation1.allocation_id);

        let allocation2 = cache
            .allocate(8, Some(Arc::new(vec![0, 1, 2, 3, 4, 5])))
            .unwrap();
        assert_eq!(allocation2.blocks, vec![16, 17, 12, 13, 14, 15, 18, 19]);
        assert_eq!(allocation2.prefix_len, 2);

        let allocation3 = cache
            .allocate(8, Some(Arc::new(vec![0, 1, 2, 3, 6, 7])))
            .unwrap();
        assert_eq!(allocation3.blocks, vec![16, 17, 6, 7, 8, 9, 10, 11]);
        assert_eq!(allocation3.prefix_len, 2);

        cache.free(allocation3.blocks.clone(), allocation3.allocation_id);
        cache.free(allocation2.blocks.clone(), allocation2.allocation_id);

        // 20 blocks, of which 1 reserved for health checks, 6 for allocation3, 2 for allocation2.
        assert_eq!(cache.free_blocks.len(), 11);

        let allocation4 = cache
            .allocate(6, Some(Arc::new(vec![0, 1, 2, 3, 4, 5])))
            .unwrap();
        assert_eq!(allocation4.blocks, vec![16, 17, 6, 7, 14, 15]);
        assert_eq!(allocation4.prefix_len, 6);
        assert_eq!(cache.free_blocks.len(), 11);

        let allocation5 = cache
            .allocate(6, Some(Arc::new(vec![0, 1, 2, 3, 6, 7])))
            .unwrap();
        assert_eq!(allocation5.blocks, vec![16, 17, 6, 7, 8, 9]);
        assert_eq!(allocation5.prefix_len, 6);
        assert_eq!(cache.free_blocks.len(), 11);
    }

    #[test]
    fn trie_insertions_have_correct_prefix_len() {
        let mut trie = RadixTrie::new(1);

        assert_eq!(trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap(), 0);

        // Already exists.
        assert_eq!(trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap(), 3);

        // Completely new at root-level
        assert_eq!(trie.insert(&[1, 2, 3], &[1, 2, 3]).unwrap(), 0);

        // Contains full prefix, but longer.
        assert_eq!(trie.insert(&[0, 1, 2, 3, 4], &[0, 1, 2, 3, 4]).unwrap(), 3);

        // Shares partial prefix, we need a split.
        assert_eq!(
            trie.insert(&[0, 1, 2, 3, 5, 6, 7], &[0, 1, 2, 3, 5, 6, 7])
                .unwrap(),
            4
        );
    }

    #[test]
    fn trie_insertions_block_size() {
        let mut trie = RadixTrie::new(2);

        assert_eq!(trie.insert(&[0, 1, 2, 3], &[0, 1]).unwrap(), 0);

        // Already exists.
        // But needs to be block_size aligned
        assert_eq!(trie.insert(&[0, 1, 2, 3], &[0, 1]).unwrap(), 4);

        // Completely new at root-level
        assert_eq!(trie.insert(&[1, 2, 3, 4], &[1, 2]).unwrap(), 0);

        // Contains full prefix, but longer.
        assert_eq!(trie.insert(&[0, 1, 2, 3, 4, 5], &[0, 1, 2]).unwrap(), 4);

        // Shares partial prefix, we need a split.
        assert_eq!(
            trie.insert(&[0, 1, 3, 4, 5, 6, 7, 8], &[0, 1, 2, 3])
                .unwrap(),
            2
        );
    }

    #[test]
    fn trie_get_returns_correct_blocks() {
        let mut trie = RadixTrie::new(1);
        trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap();
        trie.insert(&[1, 2, 3], &[1, 2, 3]).unwrap();
        trie.insert(&[0, 1, 2, 3, 4], &[0, 1, 2, 3, 4]).unwrap();
        trie.insert(&[0, 1, 2, 3, 5, 6, 7], &[0, 1, 2, 3, 5, 6, 7])
            .unwrap();

        let mut blocks = Vec::new();
        trie.find(&[0], &mut blocks);
        assert_eq!(blocks, vec![0]);

        blocks.clear();
        trie.find(&[0, 1, 2], &mut blocks);
        assert_eq!(blocks, vec![0, 1, 2]);

        blocks.clear();
        trie.find(&[1, 2, 3], &mut blocks);
        assert_eq!(blocks, vec![1, 2, 3]);

        blocks.clear();
        trie.find(&[0, 1, 2, 3], &mut blocks);
        assert_eq!(blocks, vec![0, 1, 2, 3]);

        blocks.clear();
        trie.find(&[0, 1, 2, 3, 4], &mut blocks);
        assert_eq!(blocks, vec![0, 1, 2, 3, 4]);

        blocks.clear();
        trie.find(&[0, 1, 2, 3, 5], &mut blocks);
        assert_eq!(blocks, vec![0, 1, 2, 3, 5]);
    }

    #[test]
    fn trie_evict_removes_correct_blocks() {
        let mut trie = RadixTrie::new(1);
        trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap();
        trie.insert(&[0, 1, 2, 3, 5, 6, 7], &[0, 1, 2, 3, 5, 6, 7])
            .unwrap();
        trie.insert(&[0, 1, 2, 3, 4], &[0, 1, 2, 3, 4]).unwrap();
        trie.insert(&[1, 2, 3], &[1, 2, 3]).unwrap();

        let mut blocks = Vec::new();

        // Remove less than the leave blocks.
        assert_eq!(trie.evict(1), vec![7]);
        trie.find(&[0, 1, 2, 3, 5, 6, 7], &mut blocks);
        assert_eq!(blocks, vec![0, 1, 2, 3, 5, 6]);

        // Refresh other leaf.
        trie.find(&[0, 1, 2, 3, 4], &mut blocks);
        trie.find(&[1, 2, 3], &mut blocks);

        // Remove the leave blocks exactly.
        assert_eq!(trie.evict(2), vec![5, 6]);
        blocks.clear();
        trie.find(&[0, 1, 2, 3, 5, 6, 7], &mut blocks);
        assert_eq!(blocks, vec![0, 1, 2, 3]);

        trie.find(&[1, 2, 3], &mut blocks);

        // Remove more than the leave blocks.
        assert_eq!(trie.evict(3), vec![4, 3, 2]);
        blocks.clear();
        trie.find(&[0, 1, 2, 3, 4], &mut blocks);
        assert_eq!(blocks, vec![0, 1]);

        // Clear out the whole trie.
        assert_eq!(trie.evict(10), vec![1, 2, 3, 0, 1]);
    }

    #[test]
    fn full_match_returns_correct_node() {
        let mut trie = RadixTrie::new(1);
        trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap();
        let node_id = trie.find(&[0, 1, 2], &mut vec![]);
        // At this point, there are only two nodes: the root and the node
        // with tokens 0, 1, 2. Looking up the exact prefix must return
        // the non-root node.
        assert_ne!(node_id, trie.root);
    }

    #[test]
    fn partial_match_does_not_recurse() {
        let mut trie = RadixTrie::new(1);
        trie.insert(&[0, 1, 2], &[0, 1, 2]).unwrap();
        trie.insert(&[0, 1, 2, 3, 4, 5], &[0, 1, 2, 3, 4, 5])
            .unwrap();
        let mut blocks = Vec::new();
        let node_id = trie.find(&[0, 1, 3, 4, 5], &mut blocks);
        assert_eq!(blocks, vec![0, 1]);
        assert_eq!(node_id, trie.find(&[0, 1], &mut blocks))
    }

    struct AllocationWithInfo {
        allocation: BlockAllocation,
        // We are doing a lot of set operations and `FxBuildHasher` is
        // muc faster for a set of integers.
        blockset: FxHashSet<u32>,
        non_prefix_blocks: FxHashSet<u32>,
    }

    #[test]
    fn invariants_hold_on_many_operations_remove_all() {
        invariants_hold_on_many_insertions(true);
    }

    #[test]
    fn invariants_hold_on_many_operations_remove_subset() {
        invariants_hold_on_many_insertions(false);
    }

    fn invariants_hold_on_many_insertions(remove_all: bool) {
        // Small vocabulary sizes lead to violations more quickly due to
        // prefix sharing, etc.
        const VOCAB_SIZE: u32 = 2;
        const DATA_LEN: usize = 1_000;

        const MAX_PREFILL_LEN: usize = 8;
        const MAX_DECODE_LEN: usize = 8;

        let vocab_range = Uniform::new(0, VOCAB_SIZE);
        let data_range = Uniform::new(0, DATA_LEN);
        let prefill_len_range = Uniform::new(0, MAX_PREFILL_LEN);
        let decode_len_range = Uniform::new(0, MAX_DECODE_LEN);

        let mut rng = SmallRng::seed_from_u64(64);
        let data = (0..DATA_LEN)
            .map(|_| vocab_range.sample(&mut rng))
            .collect::<Vec<_>>();
        let mut allocator = RadixAllocator::new(1, 100, None);

        let mut allocations = Vec::new();

        for i in 0..100_000 {
            // Allocate until all blocks are used.
            'allocation: loop {
                // Use offset 0 half of the times for prefix sharing.
                let prefill_offset = data_range.sample(&mut rng);
                let prefill_len = prefill_len_range.sample(&mut rng);
                let decode_len = decode_len_range.sample(&mut rng);

                let prefill =
                    data[prefill_offset..data.len().min(prefill_offset + prefill_len)].to_vec();

                let allocation = match allocator
                    .allocate((prefill.len() + decode_len) as u32, Some(Arc::new(prefill)))
                {
                    Some(allocation) => allocation,
                    None => break 'allocation,
                };
                let non_prefix_blocks = allocation.blocks[allocation.prefix_len as usize..]
                    .iter()
                    .copied()
                    .collect::<FxHashSet<_>>();
                let blockset = allocation.blocks.iter().copied().collect::<FxHashSet<_>>();

                // No duplicate blocks in an allocation.
                assert_eq!(
                    allocation.blocks.len(),
                    blockset.len(),
                    "Duplicate blocks in allocation"
                );

                allocations.push(AllocationWithInfo {
                    allocation,
                    blockset,
                    non_prefix_blocks,
                });
            }

            // Check invariants. Skip first iteration, since there is no prefix sharing yet.
            if i > 1 {
                check_allocation_invariants(&allocations);
            }

            // Remove 20% of the allocations, randomly.
            if remove_all {
                allocations.into_iter().for_each(|allocation| {
                    allocator.free(
                        allocation.allocation.blocks.clone(),
                        allocation.allocation.allocation_id,
                    )
                });
                allocations = Vec::new();
            } else {
                allocations.shuffle(&mut rng);
                let remove_index = (allocations.len() as f64 * 0.8) as usize;
                for allocation in allocations.drain(remove_index..) {
                    allocator.free(
                        allocation.allocation.blocks.clone(),
                        allocation.allocation.allocation_id,
                    );
                }
            }
        }
    }

    fn check_allocation_invariants(allocations: &[AllocationWithInfo]) {
        for i in 0..allocations.len() {
            let allocation = &allocations[i];

            // 0 is used for health checks, must not be used.
            assert!(
                !allocation.blockset.contains(&0),
                "Block 0 must not be allocated"
            );

            // No duplicate blocks in an allocation.
            assert_eq!(
                allocation.allocation.blocks.len(),
                allocation.blockset.len(),
                "Duplicate blocks in allocation"
            );

            for other_allocation in &allocations[i + 1..] {
                assert!(
                    other_allocation
                        .non_prefix_blocks
                        .is_disjoint(&allocation.non_prefix_blocks),
                    "Allocations share non-prefix blocks"
                )
            }
        }
    }
}


================================================
FILE: benchmark/Cargo.toml
================================================
[package]
name = "text-generation-benchmark"
description = "Text Generation Benchmarking tool"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[lib]
path = "src/lib.rs"

[[bin]]
name = "text-generation-benchmark"
path = "src/main.rs"

[dependencies]
average = "0.14"
clap = { version = "4.4.5", features = ["derive", "env"] }
float-ord = "0.3.2"
serde = {version = "1.0.188", features = ["derive"]}
serde_json = "1.0"
tabled = "0.14.0"
text-generation-client = { path = "../backends/client" }
thiserror = "1.0.48"
tokenizers = { workspace = true }
tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }
ratatui = "0.28.1"
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
hf-hub = { workspace = true }


================================================
FILE: benchmark/README.md
================================================
<div align="center">

# Text Generation Inference benchmarking tool

![benchmark](../assets/benchmark.png)

</div>

A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha)
and powered by [Ratatui](https://github.com/ratatui/ratatui).

## Install

```shell
make install-benchmark
```

## Run

First, start `text-generation-inference`:

```shell
text-generation-launcher --model-id bigscience/bloom-560m
```

Then run the benchmarking tool:

```shell
text-generation-benchmark --tokenizer-name bigscience/bloom-560m
```


================================================
FILE: benchmark/src/app.rs
================================================
/// Inspired by https://github.com/hatoo/oha/blob/bb989ea3cd77727e7743e7daa60a19894bb5e901/src/monitor.rs
use crate::generation::{Decode, Message, Prefill};
use ratatui::crossterm::event::{KeyCode, KeyEvent, KeyModifiers};
use ratatui::layout::{Alignment, Constraint, Direction, Layout};
use ratatui::style::{Color, Modifier, Style};
use ratatui::text::{Line, Span};
use ratatui::widgets::{
    Axis, BarChart, Block, Borders, Chart, Dataset, Gauge, GraphType, Paragraph, Tabs,
};
use ratatui::{symbols, Frame};
use text_generation_client::ClientError;
use tokio::sync::mpsc;

/// TUI powered App
pub(crate) struct App {
    pub(crate) running: bool,
    pub(crate) data: Data,
    completed_runs: Vec<usize>,
    completed_batch: usize,
    current_batch: usize,
    current_tab: usize,
    touched_tab: bool,
    zoom: bool,
    is_error: bool,
    tokenizer_name: String,
    sequence_length: u32,
    decode_length: u32,
    n_run: usize,
    receiver: mpsc::Receiver<Result<Message, ClientError>>,
}

impl App {
    pub(crate) fn new(
        receiver: mpsc::Receiver<Result<Message, ClientError>>,
        tokenizer_name: String,
        sequence_length: u32,
        decode_length: u32,
        n_run: usize,
        batch_size: Vec<u32>,
    ) -> Self {
        let current_tab = 0;

        let completed_runs: Vec<usize> = (0..batch_size.len()).map(|_| 0).collect();
        let completed_batch = 0;
        let current_batch = 0;
        let is_error = false;

        let data = Data::new(n_run, batch_size);

        Self {
            running: true,
            data,
            completed_runs,
            completed_batch,
            current_batch,
            current_tab,
            touched_tab: false,
            zoom: false,
            is_error,
            tokenizer_name,
            sequence_length,
            decode_length,
            n_run,
            receiver,
        }
    }

    /// Handle crossterm key events
    pub(crate) fn handle_key_event(&mut self, key_event: KeyEvent) {
        match key_event {
            // Increase and wrap tab
            KeyEvent {
                code: KeyCode::Right,
                ..
            }
            | KeyEvent {
                code: KeyCode::Tab, ..
            } => {
                self.touched_tab = true;
                self.current_tab = (self.current_tab + 1) % self.data.batch_size.len();
            }
            // Decrease and wrap tab
            KeyEvent {
                code: KeyCode::Left,
                ..
            } => {
                self.touched_tab = true;
                if self.current_tab > 0 {
                    self.current_tab -= 1;
                } else {
                    self.current_tab = self.data.batch_size.len() - 1;
                }
            }
            // Zoom on throughput/latency fig
            KeyEvent {
                code: KeyCode::Char('+'),
                ..
            } => {
                self.zoom = true;
            }
            // Unzoom on throughput/latency fig
            KeyEvent {
                code: KeyCode::Char('-'),
                ..
            } => {
                self.zoom = false;
            }
            // Quit
            KeyEvent {
                code: KeyCode::Char('q'),
                ..
            }
            | KeyEvent {
                code: KeyCode::Char('c'),
                modifiers: KeyModifiers::CONTROL,
                ..
            } => {
                self.running = false;
            }
            _ => (),
        }
    }

    /// Get all pending messages from generation task
    pub(crate) fn tick(&mut self) {
        while let Ok(message) = self.receiver.try_recv() {
            match message {
                Ok(message) => match message {
                    Message::Prefill(step) => self.data.push_prefill(step, self.current_batch),
                    Message::Decode(step) => self.data.push_decode(step, self.current_batch),
                    Message::EndRun => {
                        self.completed_runs[self.current_batch] += 1;
                    }
                    Message::EndBatch => {
                        self.data.end_batch(self.current_batch);
                        self.completed_batch += 1;

                        if self.current_batch < self.data.batch_size.len() - 1 {
                            // Only go to next tab if the user never touched the tab keys
                            if !self.touched_tab {
                                self.current_tab += 1;
                            }

                            self.current_batch += 1;
                        }
                    }
                    Message::Warmup => {}
                },
                Err(_) => self.is_error = true,
            }
        }
    }

    /// Render frame
    pub fn render(&mut self, f: &mut Frame) {
        let batch_progress =
            (self.completed_batch as f64 / self.data.batch_size.len() as f64).clamp(0.0, 1.0);
        let run_progress =
            (self.completed_runs[self.current_batch] as f64 / self.n_run as f64).clamp(0.0, 1.0);

        // Vertical layout
        let row5 = Layout::default()
            .direction(Direction::Vertical)
            .constraints(
                [
                    Constraint::Length(1),
                    Constraint::Length(3),
                    Constraint::Length(3),
                    Constraint::Length(13),
                    Constraint::Min(10),
                ]
                .as_ref(),
            )
            .split(f.area());

        // Top row horizontal layout
        let top = Layout::default()
            .direction(Direction::Horizontal)
            .constraints([Constraint::Percentage(50), Constraint::Percentage(50)].as_ref())
            .split(row5[2]);

        // Mid row horizontal layout
        let mid = Layout::default()
            .direction(Direction::Horizontal)
            .constraints(
                [
                    Constraint::Percentage(25),
                    Constraint::Percentage(25),
                    Constraint::Percentage(25),
                    Constraint::Percentage(25),
                ]
                .as_ref(),
            )
            .split(row5[3]);

        // Left mid row vertical layout
        let prefill_text = Layout::default()
            .direction(Direction::Vertical)
            .constraints([Constraint::Length(8), Constraint::Length(5)].as_ref())
            .split(mid[0]);

        // Right mid row vertical layout
        let decode_text = Layout::default()
            .direction(Direction::Vertical)
            .constraints([Constraint::Length(8), Constraint::Length(5)].as_ref())
            .split(mid[2]);
        let decode_text_latency = Layout::default()
            .direction(Direction::Horizontal)
            .constraints([Constraint::Percentage(50), Constraint::Percentage(50)].as_ref())
            .split(decode_text[0]);

        // Bottom row horizontal layout
        let bottom = Layout::default()
            .direction(Direction::Horizontal)
            .constraints([Constraint::Percentage(50), Constraint::Percentage(50)].as_ref())
            .split(row5[4]);

        // Title
        let title = Block::default()
            .borders(Borders::NONE)
            .title(format!(
                "Model: {} | Sequence Length: {} | Decode Length: {}",
                self.tokenizer_name, self.sequence_length, self.decode_length
            ))
            .style(
                Style::default()
                    .add_modifier(Modifier::BOLD)
                    .fg(Color::White),
            );
        f.render_widget(title, row5[0]);

        // Helper
        let helper = Block::default()
            .borders(Borders::NONE)
            .title("<- | tab | ->: change batch tab | q / CTRL + c: quit | +/-: zoom")
            .title_alignment(Alignment::Right)
            .style(Style::default().fg(Color::White));
        f.render_widget(helper, row5[0]);

        // Batch tabs
        let titles: Vec<Line> = self
            .data
            .batch_size
            .iter()
            .map(|b| {
                Line::from(vec![Span::styled(
                    format!("Batch: {b}"),
                    Style::default().fg(Color::White),
                )])
            })
            .collect();
        let tabs = Tabs::new(titles)
            .block(Block::default().borders(Borders::ALL).title("Tabs"))
            .select(self.current_tab)
            .style(Style::default().fg(Color::LightCyan))
            .highlight_style(
                Style::default()
                    .add_modifier(Modifier::BOLD)
                    .bg(Color::Black),
            );
        f.render_widget(tabs, row5[1]);

        // Total progress bar
        let color = if self.is_error {
            Color::Red
        } else {
            Color::LightGreen
        };
        let batch_gauge = progress_gauge(
            "Total Progress",
            format!("{} / {}", self.completed_batch, self.data.batch_size.len()),
            batch_progress,
            color,
        );
        f.render_widget(batch_gauge, top[0]);

        // Batch progress Bar
        let color = if self.is_error {
            Color::Red
        } else {
            Color::LightBlue
        };
        let run_gauge = progress_gauge(
            "Batch Progress",
            format!(
                "{} / {}",
                self.completed_runs[self.current_batch], self.n_run
            ),
            run_progress,
            color,
        );
        f.render_widget(run_gauge, top[1]);

        // Prefill text infos
        let prefill_latency_block = latency_paragraph(
            &mut self.data.prefill_latencies[self.current_tab],
            "Prefill",
        );
        let prefill_throughput_block =
            throughput_paragraph(&self.data.prefill_throughputs[self.current_tab], "Prefill");

        f.render_widget(prefill_latency_block, prefill_text[0]);
        f.render_widget(prefill_throughput_block, prefill_text[1]);

        // Prefill latency histogram
        let histo_width = 7;
        let bins = if mid[1].width < 2 {
            0
        } else {
            (mid[1].width as usize - 2) / (histo_width + 1)
        }
        .max(2);

        let histo_data =
            latency_histogram_data(&self.data.prefill_latencies[self.current_tab], bins);
        let histo_data_str: Vec<(&str, u64)> =
            histo_data.iter().map(|(l, v)| (l.as_str(), *v)).collect();
        let prefill_histogram =
            latency_histogram(&histo_data_str, "Prefill").bar_width(histo_width as u16);
        f.render_widget(prefill_histogram, mid[1]);

        // Decode text info
        let decode_latency_block = latency_paragraph(
            &mut self.data.decode_latencies[self.current_tab],
            "Decode Total",
        );
        let decode_token_latency_block = latency_paragraph(
            &mut self.data.decode_token_latencies[self.current_tab],
            "Decode Token",
        );
        let decode_throughput_block =
            throughput_paragraph(&self.data.decode_throughputs[self.current_tab], "Decode");
        f.render_widget(decode_latency_block, decode_text_latency[0]);
        f.render_widget(decode_token_latency_block, decode_text_latency[1]);
        f.render_widget(decode_throughput_block, decode_text[1]);

        // Decode latency histogram
        let histo_data =
            latency_histogram_data(&self.data.decode_latencies[self.current_tab], bins);
        let histo_data_str: Vec<(&str, u64)> =
            histo_data.iter().map(|(l, v)| (l.as_str(), *v)).collect();
        let decode_histogram =
            latency_histogram(&histo_data_str, "Decode").bar_width(histo_width as u16);
        f.render_widget(decode_histogram, mid[3]);

        // Prefill latency/throughput chart
        let prefill_latency_throughput_chart = latency_throughput_chart(
            &self.data.prefill_batch_latency_throughput,
            &self.data.batch_size,
            self.zoom,
            "Prefill",
        );
        f.render_widget(prefill_latency_throughput_chart, bottom[0]);

        // Decode latency/throughput chart
        let decode_latency_throughput_chart = latency_throughput_chart(
            &self.data.decode_batch_latency_throughput,
            &self.data.batch_size,
            self.zoom,
            "Decode",
        );
        f.render_widget(decode_latency_throughput_chart, bottom[1]);
    }
}

/// App internal data struct
pub(crate) struct Data {
    pub(crate) batch_size: Vec<u32>,
    pub(crate) prefill_latencies: Vec<Vec<f64>>,
    pub(crate) prefill_throughputs: Vec<Vec<f64>>,
    pub(crate) decode_latencies: Vec<Vec<f64>>,
    pub(crate) decode_token_latencies: Vec<Vec<f64>>,
    pub(crate) decode_throughputs: Vec<Vec<f64>>,
    pub(crate) prefill_batch_latency_throughput: Vec<(f64, f64)>,
    pub(crate) decode_batch_latency_throughput: Vec<(f64, f64)>,
}

impl Data {
    fn new(n_run: usize, batch_size: Vec<u32>) -> Self {
        let prefill_latencies: Vec<Vec<f64>> = (0..batch_size.len())
            .map(|_| Vec::with_capacity(n_run))
            .collect();
        let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();

        let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
        let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
        let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();

        let prefill_batch_latency_throughput: Vec<(f64, f64)> =
            Vec::with_capacity(batch_size.len());
        let decode_batch_latency_throughput: Vec<(f64, f64)> =
            prefill_batch_latency_throughput.clone();

        Self {
            batch_size,
            prefill_latencies,
            prefill_throughputs,
            decode_latencies,
            decode_token_latencies,
            decode_throughputs,
            prefill_batch_latency_throughput,
            decode_batch_latency_throughput,
        }
    }

    fn push_prefill(&mut self, prefill: Prefill, batch_idx: usize) {
        let latency = prefill.latency.as_micros() as f64 / 1000.0;
        self.prefill_latencies[batch_idx].push(latency);
        self.prefill_throughputs[batch_idx].push(prefill.throughput);
    }

    fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
        let latency = decode.latency.as_micros() as f64 / 1000.0;
        let token_latency = decode.token_latency.as_micros() as f64 / 1000.0;
        self.decode_latencies[batch_idx].push(latency);
        self.decode_token_latencies[batch_idx].push(token_latency);
        self.decode_throughputs[batch_idx].push(decode.throughput);
    }

    fn end_batch(&mut self, batch_idx: usize) {
        self.prefill_batch_latency_throughput.push((
            self.prefill_latencies[batch_idx].iter().sum::<f64>()
                / self.prefill_latencies[batch_idx].len() as f64,
            self.prefill_throughputs[batch_idx].iter().sum::<f64>()
                / self.prefill_throughputs[batch_idx].len() as f64,
        ));
        self.decode_batch_latency_throughput.push((
            self.decode_latencies[batch_idx].iter().sum::<f64>()
                / self.decode_latencies[batch_idx].len() as f64,
            self.decode_throughputs[batch_idx].iter().sum::<f64>()
                / self.decode_throughputs[batch_idx].len() as f64,
        ));
    }
}

/// Progress bar
fn progress_gauge(title: &str, label: String, progress: f64, color: Color) -> Gauge<'_> {
    Gauge::default()
        .block(Block::default().title(title).borders(Borders::ALL))
        .gauge_style(Style::default().fg(color))
        .label(Span::raw(label))
        .ratio(progress)
}

/// Throughput paragraph
fn throughput_paragraph<'a>(throughput: &[f64], name: &'static str) -> Paragraph<'a> {
    // Throughput average/high/low texts
    let throughput_texts = statis_spans(throughput, "tokens/secs");

    // Throughput block
    Paragraph::new(throughput_texts).block(
        Block::default()
            .title(Span::raw(format!("{name} Throughput")))
            .borders(Borders::ALL),
    )
}

/// Latency paragraph
fn latency_paragraph<'a>(latency: &mut [f64], name: &'static str) -> Paragraph<'a> {
    // Latency average/high/low texts
    let mut latency_texts = statis_spans(latency, "ms");

    // Sort latency for percentiles
    float_ord::sort(latency);
    let latency_percentiles = crate::utils::percentiles(latency, &[50, 90, 99]);

    // Latency p50/p90/p99 texts
    let colors = [Color::LightGreen, Color::LightYellow, Color::LightRed];
    for (i, (name, value)) in latency_percentiles.iter().enumerate() {
        let span = Line::from(vec![Span::styled(
            format!("{name}:     {value:.2} ms"),
            Style::default().fg(colors[i]),
        )]);
        latency_texts.push(span);
    }

    Paragraph::new(latency_texts).block(
        Block::default()
            .title(Span::raw(format!("{name} Latency")))
            .borders(Borders::ALL),
    )
}

/// Average/High/Low spans
fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
    vec![
        Line::from(vec![Span::styled(
            format!(
                "Average: {:.2} {unit}",
                data.iter().sum::<f64>() / data.len() as f64
            ),
            Style::default().fg(Color::LightBlue),
        )]),
        Line::from(vec![Span::styled(
            format!(
                "Lowest:  {:.2} {unit}",
                data.iter()
                    .min_by(|a, b| a.total_cmp(b))
                    .unwrap_or(&f64::NAN)
            ),
            Style::default().fg(Color::Reset),
        )]),
        Line::from(vec![Span::styled(
            format!(
                "Highest: {:.2} {unit}",
                data.iter()
                    .max_by(|a, b| a.total_cmp(b))
                    .unwrap_or(&f64::NAN)
            ),
            Style::default().fg(Color::Reset),
        )]),
    ]
}

/// Latency histogram data
fn latency_histogram_data(latency: &[f64], bins: usize) -> Vec<(String, u64)> {
    let histo_data: Vec<(String, u64)> = {
        let histo = crate::utils::histogram(latency, bins);
        histo
            .into_iter()
            .map(|(label, v)| (format!("{label:.2}"), v as u64))
            .collect()
    };

    histo_data
}

/// Latency Histogram
fn latency_histogram<'a>(
    histo_data_str: &'a Vec<(&'a str, u64)>,
    name: &'static str,
) -> BarChart<'a> {
    BarChart::default()
        .block(
            Block::default()
                .title(format!("{name} latency histogram"))
                .style(Style::default().fg(Color::LightYellow).bg(Color::Reset))
                .borders(Borders::ALL),
        )
        .data(histo_data_str.as_slice())
}

/// Latency/Throughput chart
fn latency_throughput_chart<'a>(
    latency_throughput: &'a [(f64, f64)],
    batch_sizes: &'a [u32],
    zoom: bool,
    name: &'static str,
) -> Chart<'a> {
    let latency_iter = latency_throughput.iter().map(|(l, _)| l);
    let throughput_iter = latency_throughput.iter().map(|(_, t)| t);

    // Get extreme values
    let min_latency: f64 = *latency_iter
        .clone()
        .min_by(|a, b| a.total_cmp(b))
        .unwrap_or(&f64::NAN);
    let max_latency: f64 = *latency_iter
        .max_by(|a, b| a.total_cmp(b))
        .unwrap_or(&f64::NAN);
    let min_throughput: f64 = *throughput_iter
        .clone()
        .min_by(|a, b| a.total_cmp(b))
        .unwrap_or(&f64::NAN);
    let max_throughput: f64 = *throughput_iter
        .max_by(|a, b| a.total_cmp(b))
        .unwrap_or(&f64::NAN);

    // Char min max values
    let min_x = if zoom {
        ((min_latency - 0.05 * min_latency) / 100.0).floor() * 100.0
    } else {
        0.0
    };
    let max_x = ((max_latency + 0.05 * max_latency) / 100.0).ceil() * 100.0;
    let step_x = (max_x - min_x) / 4.0;

    // Chart min max values
    let min_y = if zoom {
        ((min_throughput - 0.05 * min_throughput) / 100.0).floor() * 100.0
    } else {
        0.0
    };
    let max_y = ((max_throughput + 0.05 * max_throughput) / 100.0).ceil() * 100.0;
    let step_y = (max_y - min_y) / 4.0;

    // Labels
    let mut x_labels = vec![Span::styled(
        format!("{min_x:.2}"),
        Style::default()
            .add_modifier(Modifier::BOLD)
            .fg(Color::Gray)
            .bg(Color::Reset),
    )];
    for i in 0..3 {
        x_labels.push(Span::styled(
            format!("{:.2}", min_x + ((i + 1) as f64 * step_x)),
            Style::default().fg(Color::Gray).bg(Color::Reset),
        ));
    }
    x_labels.push(Span::styled(
        format!("{max_x:.2}"),
        Style::default()
            .add_modifier(Modifier::BOLD)
            .fg(Color::Gray)
            .bg(Color::Reset),
    ));

    // Labels
    let mut y_labels = vec![Span::styled(
        format!("{min_y:.2}"),
        Style::default()
            .add_modifier(Modifier::BOLD)
            .fg(Color::Gray)
            .bg(Color::Reset),
    )];
    for i in 0..3 {
        y_labels.push(Span::styled(
            format!("{:.2}", min_y + ((i + 1) as f64 * step_y)),
            Style::default().fg(Color::Gray).bg(Color::Reset),
        ));
    }
    y_labels.push(Span::styled(
        format!("{max_y:.2}"),
        Style::default()
            .add_modifier(Modifier::BOLD)
            .fg(Color::Gray)
            .bg(Color::Reset),
    ));

    // Chart dataset
    let colors = color_vec();
    let datasets: Vec<Dataset> = (0..latency_throughput.len())
        .map(|i| {
            let color_idx = i % colors.len();

            Dataset::default()
                .name(batch_sizes[i].to_string())
                .marker(symbols::Marker::Block)
                .style(Style::default().fg(colors[color_idx]))
                .graph_type(GraphType::Scatter)
                .data(&latency_throughput[i..(i + 1)])
        })
        .collect();

    // Chart
    Chart::new(datasets)
        .style(Style::default().fg(Color::Cyan).bg(Color::Reset))
        .block(
            Block::default()
                .title(Span::styled(
                    format!("{name} throughput over latency"),
                    Style::default().fg(Color::Gray).bg(Color::Reset),
                ))
                .borders(Borders::ALL),
        )
        .x_axis(
            Axis::default()
                .title("ms")
                .style(Style::default().fg(Color::Gray).bg(Color::Reset))
                .labels(x_labels)
                .bounds([min_x, max_x]),
        )
        .y_axis(
            Axis::default()
                .title("tokens/secs")
                .style(Style::default().fg(Color::Gray).bg(Color::Reset))
                .labels(y_labels)
                .bounds([min_y, max_y]),
        )
}

// Colors for latency/throughput chart
fn color_vec() -> Vec<Color> {
    vec![
        Color::Red,
        Color::Green,
        Color::Yellow,
        Color::Blue,
        Color::Magenta,
        Color::Cyan,
        Color::Gray,
        Color::DarkGray,
        Color::LightRed,
        Color::LightGreen,
        Color::LightYellow,
        Color::LightBlue,
        Color::LightMagenta,
        Color::LightCyan,
    ]
}


================================================
FILE: benchmark/src/event.rs
================================================
/// Inspired by https://github.com/orhun/rust-tui-template/blob/472aa515119d4c94903eac12d9784417281dc7f5/src/event.rs
use ratatui::crossterm::event;
use std::time::{Duration, Instant};
use tokio::sync::{broadcast, mpsc};

/// Events
#[derive(Debug)]
pub(crate) enum Event {
    /// Terminal tick.
    Tick,
    /// Key press.
    Key(event::KeyEvent),
    /// Terminal resize.
    Resize,
}

pub(crate) async fn terminal_event_task(
    fps: u32,
    event_sender: mpsc::Sender<Event>,
    mut shutdown_receiver: broadcast::Receiver<()>,
    _shutdown_guard_sender: mpsc::Sender<()>,
) {
    // End task if a message is received on shutdown_receiver
    // _shutdown_guard_sender will be dropped once the task is finished
    tokio::select! {
        _ = event_loop(fps, event_sender)  => {
        },
        _ = shutdown_receiver.recv() => {}
    }
}

/// Main event loop
async fn event_loop(fps: u32, event_sender: mpsc::Sender<Event>) {
    // Frame budget
    let per_frame = Duration::from_secs(1) / fps;

    // When was last frame executed
    let mut last_frame = Instant::now();

    loop {
        // Sleep to avoid blocking the thread for too long
        if let Some(sleep) = per_frame.checked_sub(last_frame.elapsed()) {
            tokio::time::sleep(sleep).await;
        }

        // Get crossterm event and send a new one over the channel
        if event::poll(Duration::from_secs(0)).expect("no events available") {
            match event::read().expect("unable to read event") {
                event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()),
                event::Event::Resize(_w, _h) => {
                    event_sender.send(Event::Resize).await.unwrap_or(())
                }
                _ => (),
            }
        }

        // Frame budget exceeded
        if last_frame.elapsed() >= per_frame {
            // Send tick
            event_sender.send(Event::Tick).await.unwrap_or(());
            // Rest last_frame time
            last_frame = Instant::now();
        }
    }
}


================================================
FILE: benchmark/src/generation.rs
================================================
use std::time::{Duration, Instant};
use text_generation_client::v3::{
    Batch, CachedBatch, NextTokenChooserParameters, Request, ShardedClient,
    StoppingCriteriaParameters,
};
use text_generation_client::{Chunk, ClientError, Input};
use tokenizers::{Tokenizer, TruncationDirection};
use tokio::sync::{broadcast, mpsc};

const LOREM_IPSUM: &str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";

#[derive(Debug, Clone)]
pub(crate) struct Prefill {
    pub(crate) latency: Duration,
    pub(crate) throughput: f64,
}

#[derive(Debug, Clone)]
pub(crate) struct Decode {
    pub(crate) latency: Duration,
    pub(crate) token_latency: Duration,
    pub(crate) throughput: f64,
}

#[derive(Debug)]
pub(crate) enum Message {
    Warmup,
    Prefill(Prefill),
    Decode(Decode),
    EndRun,
    EndBatch,
}

/// Benchmarking task
#[allow(clippy::too_many_arguments)]
pub(crate) async fn generation_task(
    tokenizer: Tokenizer,
    batch_size: Vec<u32>,
    sequence_length: u32,
    decode_length: u32,
    top_n_tokens: Option<u32>,
    n_runs: usize,
    warmups: usize,
    parameters: NextTokenChooserParameters,
    client: ShardedClient,
    run_sender: mpsc::Sender<Result<Message, ClientError>>,
    mut shutdown_receiver: broadcast::Receiver<()>,
    _shutdown_guard_sender: mpsc::Sender<()>,
) {
    // End task if a message is received on shutdown_receiver
    // _shutdown_guard_sender will be dropped once the task is finished
    tokio::select! {
        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, top_n_tokens, n_runs, warmups, parameters, client, run_sender.clone())  => {
            if let Err(err) = res {
                run_sender.send(Err(err)).await.unwrap_or(());
            }
        },
        _ = shutdown_receiver.recv() => {}
    }
}

/// Benchmark prefill/decode
#[allow(clippy::too_many_arguments)]
async fn generate_runs(
    tokenizer: Tokenizer,
    batch_size: Vec<u32>,
    sequence_length: u32,
    decode_length: u32,
    top_n_tokens: Option<u32>,
    n_runs: usize,
    warmups: usize,
    parameters: NextTokenChooserParameters,
    mut client: ShardedClient,
    run_sender: mpsc::Sender<Result<Message, ClientError>>,
) -> Result<(), ClientError> {
    // Create a dummy sequence
    let sequence = create_sequence(sequence_length, tokenizer);

    for b in batch_size {
        // Warmups on batch size
        for _ in 0..warmups {
            let (_, decode_batch) = prefill(
                sequence.clone(),
                sequence_length,
                b,
                decode_length,
                parameters.clone(),
                top_n_tokens,
                &mut client,
            )
            .await?;
            let _ = decode(decode_batch, &mut client).await?;
            // Send warmup message
            run_sender.send(Ok(Message::Warmup)).await.unwrap_or(());
        }

        for _ in 0..n_runs {
            let (prefill, decode_batch) = prefill(
                sequence.clone(),
                sequence_length,
                b,
                decode_length,
                parameters.clone(),
                top_n_tokens,
                &mut client,
            )
            .await?;
            // Send prefill message
            run_sender
                .send(Ok(Message::Prefill(prefill)))
                .await
                .unwrap_or(());

            let decode = decode(decode_batch, &mut client).await?;

            // Send decode message
            run_sender
                .send(Ok(Message::Decode(decode)))
                .await
                .unwrap_or(());

            // Send run ended message
            run_sender.send(Ok(Message::EndRun)).await.unwrap_or(());
        }
        // Batch ended
        run_sender.send(Ok(Message::EndBatch)).await.unwrap_or(());
    }
    Ok(())
}

// Run a prefill step
async fn prefill(
    sequence: String,
    sequence_length: u32,
    batch_size: u32,
    decode_length: u32,
    parameters: NextTokenChooserParameters,
    top_n_tokens: Option<u32>,
    client: &mut ShardedClient,
) -> Result<(Prefill, CachedBatch), ClientError> {
    // Create requests
    let requests = (0..batch_size)
        .map(|id| Request {
            id: id.into(),
            prefill_logprobs: false,
            input_chunks: Some(Input {
                chunks: vec![Chunk::Text(sequence.clone()).into()],
            }),
            inputs: sequence.clone(),
            truncate: sequence_length,
            add_special_tokens: true,
            parameters: Some(parameters.clone()),
            stopping_parameters: Some(StoppingCriteriaParameters {
                max_new_tokens: decode_length,
                stop_sequences: vec![],
                ignore_eos_token: true, // Will not stop even if a eos token is generated
            }),
            top_n_tokens: top_n_tokens.unwrap_or(0),
            blocks: vec![],
            slots: vec![],
            cache_len: 0,
            chunk_len: None,
            adapter_id: None,
        })
        .collect();

    let batch = Batch {
        id: 0,
        requests,
        size: batch_size,
        max_tokens: batch_size * (sequence_length + decode_length),
        max_blocks: 0,
    };

    // Run prefill
    let start_time = Instant::now();
    let (_, decode_batch, _) = client.prefill(batch.clone(), None).await?;

    // Get latency
    let latency = start_time.elapsed();

    // Compute throughput from latency and batch size
    let throughput = (batch_size * sequence_length) as f64 / latency.as_secs_f64();

    // Decode batch cannot be empty
    let decode_batch = decode_batch.expect("decode_batch is None. This is a bug.");

    let step = Prefill {
        latency,
        throughput,
    };

    Ok((step, decode_batch))
}

/// Run a full decode
async fn decode(batch: CachedBatch, client: &mut ShardedClient) -> Result<Decode, ClientError> {
    let mut decode_length = 0;
    let batch_size = batch.size;

    let start_time = Instant::now();

    // Full decode over decode length
    let mut next_batch = Some(batch);
    while let Some(batch) = next_batch {
        let result = client.decode(vec![batch]).await?;
        next_batch = result.1;
        decode_length += 1;
    }

    // Get latency
    let latency = start_time.elapsed();
    let token_latency = latency / decode_length;

    // Compute throughput from latency, batch size and decode length
    let throughput = (batch_size * decode_length) as f64 / latency.as_secs_f64();

    let step = Decode {
        latency,
        token_latency,
        throughput,
    };
    Ok(step)
}

/// Create a dummy sequence of the correct length
fn create_sequence(sequence_length: u32, tokenizer: Tokenizer) -> String {
    let lorem_ipsum_length = tokenizer.encode(LOREM_IPSUM, true).unwrap().len();
    // Repeat lorem ipsum to cover sequence length
    let string_sequence =
        LOREM_IPSUM.repeat((0..sequence_length).step_by(lorem_ipsum_length).len());
    // Encode sequence
    let mut encoding = tokenizer.encode(string_sequence, true).unwrap();
    // Truncate to sequence_length
    encoding.truncate(sequence_length as usize, 0, TruncationDirection::Left);
    // Decode
    tokenizer.decode(encoding.get_ids(), false).unwrap()
}


================================================
FILE: benchmark/src/lib.rs
================================================
mod app;
mod event;
mod generation;
mod table;
mod utils;

use crate::app::App;
use crate::event::Event;
use ratatui::backend::CrosstermBackend;
use ratatui::crossterm::ExecutableCommand;
use ratatui::Terminal;
use std::io;
use text_generation_client::v3::{GrammarType, NextTokenChooserParameters, ShardedClient};
use tokenizers::Tokenizer;
use tokio::sync::{broadcast, mpsc};

/// Run benchmarking app
#[allow(clippy::too_many_arguments)]
pub async fn run(
    tokenizer_name: String,
    tokenizer: Tokenizer,
    batch_size: Vec<u32>,
    sequence_length: u32,
    decode_length: u32,
    top_n_tokens: Option<u32>,
    n_runs: usize,
    warmups: usize,
    temperature: Option<f32>,
    top_k: Option<u32>,
    top_p: Option<f32>,
    typical_p: Option<f32>,
    repetition_penalty: Option<f32>,
    frequency_penalty: Option<f32>,
    watermark: bool,
    do_sample: bool,
    client: ShardedClient,
) -> Result<(), std::io::Error> {
    let parameters = NextTokenChooserParameters {
        temperature: temperature.unwrap_or(1.0),
        top_k: top_k.unwrap_or(0),
        top_p: top_p.unwrap_or(1.0),
        typical_p: typical_p.unwrap_or(1.0),
        do_sample,
        seed: 0,
        repetition_penalty: repetition_penalty.unwrap_or(1.0),
        frequency_penalty: frequency_penalty.unwrap_or(0.0),
        watermark,
        grammar: String::new(),
        grammar_type: GrammarType::None as i32,
    };

    // Initialize terminal properties
    ratatui::crossterm::terminal::enable_raw_mode()?;
    io::stdout().execute(ratatui::crossterm::terminal::EnterAlternateScreen)?;
    io::stdout().execute(ratatui::crossterm::cursor::Hide)?;

    // Initialize terminal
    let mut terminal = {
        let backend = CrosstermBackend::new(io::stdout());
        Terminal::new(backend)?
    };

    // Create message channel between generation_task and app
    let (run_sender, run_receiver) = mpsc::channel(8);
    // Crossterm event channel
    let (event_sender, mut event_receiver) = mpsc::channel(8);
    // Shutdown channel to terminate tasks
    let (shutdown_sender, _) = broadcast::channel(1);
    // Channel to check if tasks terminated
    let (shutdown_guard_sender, mut shutdown_guard_receiver) = mpsc::channel(1);

    // Create generation task
    tokio::spawn(generation::generation_task(
        tokenizer,
        batch_size.clone(),
        sequence_length,
        decode_length,
        top_n_tokens,
        n_runs,
        warmups,
        parameters,
        client,
        run_sender,
        shutdown_sender.subscribe(),
        shutdown_guard_sender.clone(),
    ));

    // Create event task
    tokio::spawn(event::terminal_event_task(
        250,
        event_sender,
        shutdown_sender.subscribe(),
        shutdown_guard_sender.clone(),
    ));

    // Drop our end of shutdown sender
    drop(shutdown_guard_sender);

    // Create App
    let mut app = App::new(
        run_receiver,
        tokenizer_name.clone(),
        sequence_length,
        decode_length,
        n_runs,
        batch_size,
    );

    while app.running {
        // Draw frame
        terminal.draw(|frame| app.render(frame))?;

        // Await a new event from event handling task
        match event_receiver.recv().await {
            None => break,
            // Update app state
            Some(event) => match event {
                Event::Tick => app.tick(),
                Event::Key(key_event) => app.handle_key_event(key_event),
                _ => {}
            },
        }
    }

    // Ask tasks to shutdown
    let _ = shutdown_sender.send(());
    // Wait for tasks to shutdown
    let _ = shutdown_guard_receiver.recv().await;

    // Revert terminal to original view
    io::stdout().execute(ratatui::crossterm::terminal::LeaveAlternateScreen)?;
    ratatui::crossterm::terminal::disable_raw_mode()?;
    io::stdout().execute(ratatui::crossterm::cursor::Show)?;

    let parameters_table = table::parameters_table(
        tokenizer_name,
        sequence_length,
        decode_length,
        top_n_tokens,
        n_runs,
        warmups,
        temperature,
        top_k,
        top_p,
        typical_p,
        repetition_penalty,
        frequency_penalty,
        watermark,
        do_sample,
    );
    println!("\n{parameters_table}\n");

    let latency_table = table::latency_table(&app.data);
    println!("\n{latency_table}\n");

    let throughput_table = table::throughput_table(&app.data);
    println!("\n{throughput_table}\n");

    Ok(())
}


================================================
FILE: benchmark/src/main.rs
================================================
/// Text Generation Inference benchmarking tool
///
/// Inspired by the great Oha app: https://github.com/hatoo/oha
/// and: https://github.com/orhun/rust-tui-template
use clap::Parser;
use std::path::Path;
use text_generation_client::v3::ShardedClient;
use tokenizers::{FromPretrainedParameters, Tokenizer};
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::EnvFilter;

/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    /// The name of the tokenizer (as in model_id on the huggingface hub, or local path).
    #[clap(short, long, env)]
    tokenizer_name: String,

    /// The revision to use for the tokenizer if on the hub.
    #[clap(default_value = "main", long, env)]
    revision: String,

    /// The various batch sizes to benchmark for, the idea is to get enough
    /// batching to start seeing increased latency, this usually means you're
    /// moving from memory bound (usual as BS=1) to compute bound, and this is
    /// a sweet spot for the maximum batch size for the model under test
    #[clap(short, long)]
    batch_size: Option<Vec<u32>>,

    /// This is the initial prompt sent to the text-generation-server length
    /// in token. Longer prompt will slow down the benchmark. Usually the
    /// latency grows somewhat linearly with this for the prefill step.
    ///
    /// Most importantly, the prefill step is usually not the one dominating
    /// your runtime, so it's ok to keep it short.
    #[clap(default_value = "10", short, long, env)]
    sequence_length: u32,

    /// This is how many tokens will be generated by the server and averaged out
    /// to give the `decode` latency. This is the *critical* number you want to optimize for
    /// LLM spend most of their time doing decoding.
    ///
    /// Decode latency is usually quite stable.
    #[clap(default_value = "8", short, long, env)]
    decode_length: u32,

    ///How many runs should we average from
    #[clap(default_value = "10", short, long, env)]
    runs: usize,

    /// Number of warmup cycles
    #[clap(default_value = "1", short, long, env)]
    warmups: usize,

    /// The location of the grpc socket. This benchmark tool bypasses the router
    /// completely and directly talks to the gRPC processes
    #[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
    master_shard_uds_path: String,

    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
    temperature: Option<f32>,

    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
    top_k: Option<u32>,

    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
    top_p: Option<f32>,

    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
    typical_p: Option<f32>,

    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
    repetition_penalty: Option<f32>,

    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
    frequency_penalty: Option<f32>,

    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
    watermark: bool,

    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
    do_sample: bool,

    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
    top_n_tokens: Option<u32>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    init_logging();

    // Get args
    let args = Args::parse();
    // Pattern match configuration
    let Args {
        tokenizer_name,
        revision,
        batch_size,
        sequence_length,
        decode_length,
        runs,
        warmups,
        temperature,
        top_k,
        top_p,
        typical_p,
        repetition_penalty,
        frequency_penalty,
        watermark,
        do_sample,
        master_shard_uds_path,
        top_n_tokens,
    } = args;

    let batch_size = batch_size.unwrap_or(vec![1, 2, 4, 8, 16, 32]);

    // Tokenizer instance
    // This will only be used to validate payloads
    tracing::info!("Loading tokenizer");
    let local_path = Path::new(&tokenizer_name);
    let tokenizer =
        if local_path.exists() && local_path.is_dir() && local_path.join("tokenizer.json").exists()
        {
            // Load local tokenizer
            tracing::info!("Found local tokenizer");
            Tokenizer::from_file(local_path.join("tokenizer.json")).unwrap()
        } else {
            tracing::info!("Downloading tokenizer");

            // Parse Huggingface hub token
            let token = std::env::var("HF_TOKEN")
                .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
                .ok();

            // Download and instantiate tokenizer
            // We need to download it outside of the Tokio runtime
            let params = FromPretrainedParameters {
                revision,
                token,
                ..Default::default()
            };
            Tokenizer::from_pretrained(tokenizer_name.clone(), Some(params)).unwrap()
        };
    tracing::info!("Tokenizer loaded");

    // Launch Tokio runtime
    tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap()
        .block_on(async {
            // Instantiate sharded client from the master unix socket
            tracing::info!("Connect to model server");
            let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path)
                .await
                .expect("Could not connect to server");
            // Clear the cache; useful if the webserver rebooted
            sharded_client
                .clear_cache(None)
                .await
                .expect("Unable to clear cache");

            tracing::info!("Connected");

            // Run app
            text_generation_benchmark::run(
                tokenizer_name,
                tokenizer,
                batch_size,
                sequence_length,
                decode_length,
                top_n_tokens,
                runs,
                warmups,
                temperature,
                top_k,
                top_p,
                typical_p,
                repetition_penalty,
                frequency_penalty,
                watermark,
                do_sample,
                sharded_client,
            )
            .await
            .unwrap();
        });
    Ok(())
}

/// Init logging using LOG_LEVEL
fn init_logging() {
    // STDOUT/STDERR layer
    let fmt_layer = tracing_subscriber::fmt::layer()
        .with_file(true)
        .with_line_number(true);

    // Filter events with LOG_LEVEL
    let env_filter =
        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));

    tracing_subscriber::registry()
        .with(env_filter)
        .with(fmt_layer)
        .init();
}


================================================
FILE: benchmark/src/table.rs
================================================
use crate::app::Data;
use tabled::settings::Merge;
use tabled::{builder::Builder, settings::Style, Table};

#[allow(clippy::too_many_arguments)]
pub(crate) fn parameters_table(
    tokenizer_name: String,
    sequence_length: u32,
    decode_length: u32,
    top_n_tokens: Option<u32>,
    n_runs: usize,
    warmups: usize,
    temperature: Option<f32>,
    top_k: Option<u32>,
    top_p: Option<f32>,
    typical_p: Option<f32>,
    repetition_penalty: Option<f32>,
    frequency_penalty: Option<f32>,
    watermark: bool,
    do_sample: bool,
) -> Table {
    let mut builder = Builder::default();

    builder.set_header(["Parameter", "Value"]);

    builder.push_record(["Model", &tokenizer_name]);
    builder.push_record(["Sequence Length", &sequence_length.to_string()]);
    builder.push_record(["Decode Length", &decode_length.to_string()]);
    builder.push_record(["Top N Tokens", &format!("{top_n_tokens:?}")]);
    builder.push_record(["N Runs", &n_runs.to_string()]);
    builder.push_record(["Warmups", &warmups.to_string()]);
    builder.push_record(["Temperature", &format!("{temperature:?}")]);
    builder.push_record(["Top K", &format!("{top_k:?}")]);
    builder.push_record(["Top P", &format!("{top_p:?}")]);
    builder.push_record(["Typical P", &format!("{typical_p:?}")]);
    builder.push_record(["Repetition Penalty", &format!("{repetition_penalty:?}")]);
    builder.push_record(["Frequency Penalty", &format!("{frequency_penalty:?}")]);
    builder.push_record(["Watermark", &watermark.to_string()]);
    builder.push_record(["Do Sample", &do_sample.to_string()]);

    let mut table = builder.build();
    table.with(Style::markdown());
    table
}

pub(crate) fn latency_table(data: &Data) -> Table {
    let mut builder = Builder::default();

    builder.set_header([
        "Step",
        "Batch Size",
        "Average",
        "Lowest",
        "Highest",
        "p50",
        "p90",
        "p99",
    ]);

    add_latencies(
        &mut builder,
        "Prefill",
        &data.batch_size,
        &data.prefill_latencies,
    );
    add_latencies(
        &mut builder,
        "Decode (token)",
        &data.batch_size,
        &data.decode_token_latencies,
    );
    add_latencies(
        &mut builder,
        "Decode (total)",
        &data.batch_size,
        &data.decode_latencies,
    );

    let mut table = builder.build();
    table.with(Style::markdown()).with(Merge::vertical());
    table
}

pub(crate) fn throughput_table(data: &Data) -> Table {
    let mut builder = Builder::default();

    builder.set_header(["Step", "Batch Size", "Average", "Lowest", "Highest"]);

    add_throuhgputs(
        &mut builder,
        "Prefill",
        &data.batch_size,
        &data.prefill_throughputs,
    );
    add_throuhgputs(
        &mut builder,
        "Decode",
        &data.batch_size,
        &data.decode_throughputs,
    );

    let mut table = builder.build();
    table.with(Style::markdown()).with(Merge::vertical());
    table
}

fn add_latencies(
    builder: &mut Builder,
    step: &'static str,
    batch_size: &[u32],
    batch_latencies: &[Vec<f64>],
) {
    for (i, b) in batch_size.iter().enumerate() {
        let latencies = &batch_latencies[i];
        let (avg, min, max) = avg_min_max(latencies);

        let row = [
            step,
            &b.to_string(),
            &format_value(avg, "ms"),
            &format_value(min, "ms"),
            &format_value(max, "ms"),
            &format_value(px(latencies, 50), "ms"),
            &format_value(px(latencies, 90), "ms"),
            &format_value(px(latencies, 99), "ms"),
        ];

        builder.push_record(row);
    }
}

fn add_throuhgputs(
    builder: &mut Builder,
    step: &'static str,
    batch_size: &[u32],
    batch_throughputs: &[Vec<f64>],
) {
    for (i, b) in batch_size.iter().enumerate() {
        let throughputs = &batch_throughputs[i];
        let (avg, min, max) = avg_min_max(throughputs);

        let row = [
            step,
            &b.to_string(),
            &format_value(avg, "tokens/secs"),
            &format_value(min, "tokens/secs"),
            &format_value(max, "tokens/secs"),
        ];

        builder.push_record(row);
    }
}

fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
    let average = data.iter().sum::<f64>() / data.len() as f64;
    let min = data
        .iter()
        .min_by(|a, b| a.total_cmp(b))
        .unwrap_or(&f64::NAN);
    let max = data
        .iter()
        .max_by(|a, b| a.total_cmp(b))
        .unwrap_or(&f64::NAN);
    (average, *min, *max)
}

fn px(data: &[f64], p: u32) -> f64 {
    let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
    *data.get(i).unwrap_or(&f64::NAN)
}

fn format_value(value: f64, unit: &'static str) -> String {
    format!("{:.2} {unit}", value)
}


================================================
FILE: benchmark/src/utils.rs
================================================
/// MIT License
//
// Copyright (c) 2020 hatoo
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
use std::collections::BTreeMap;

pub(crate) fn histogram(values: &[f64], bins: usize) -> Vec<(f64, usize)> {
    assert!(bins >= 2);
    let mut bucket: Vec<usize> = vec![0; bins];
    let min = values.iter().collect::<average::Min>().min();
    let max = values.iter().collect::<average::Max>().max();
    let step = (max - min) / (bins - 1) as f64;

    for &v in values {
        let i = std::cmp::min(((v - min) / step).ceil() as usize, bins - 1);
        bucket[i] += 1;
    }

    bucket
        .into_iter()
        .enumerate()
        .map(|(i, v)| (min + step * i as f64, v))
        .collect()
}

pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap<String, f64> {
    pecents
        .iter()
        .map(|&p| {
            let i = (f64::from(p) / 100.0 * values.len() as f64) as usize;
            (format!("p{p}"), *values.get(i).unwrap_or(&f64::NAN))
        })
        .collect()
}


================================================
FILE: clients/python/.gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
text_generation/__pycache__/
text_generation/pb/__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

transformers
safetensors


================================================
FILE: clients/python/Makefile
================================================
unit-tests:
	python -m pytest --cov=text_generation tests

install:
	pip install pip --upgrade
	pip install -e .


================================================
FILE: clients/python/README.md
================================================
# Legacy warning ⚠️
The inference clients from [huggingface_hub](https://huggingface.co/docs/huggingface_hub/guides/inference) are recommended over `text_generation`.

# Text Generation

The Hugging Face Text Generation Python library provides a convenient way of interfacing with a
`text-generation-inference` instance running on
[Hugging Face Inference Endpoints](https://huggingface.co/inference-endpoints) or on the Hugging Face Hub.

## Get Started

### Install

```shell
pip install text-generation
```

### Inference API Usage

```python
from text_generation import InferenceAPIClient

client = InferenceAPIClient("bigscience/bloomz")
text = client.generate("Why is the sky blue?").generated_text
print(text)
# ' Rayleigh scattering'

# Token Streaming
text = ""
for response in client.generate_stream("Why is the sky blue?"):
    if not response.token.special:
        text += response.token.text

print(text)
# ' Rayleigh scattering'
```

or with the asynchronous client:

```python
from text_generation import InferenceAPIAsyncClient

client = InferenceAPIAsyncClient("bigscience/bloomz")
response = await client.generate("Why is the sky blue?")
print(response.generated_text)
# ' Rayleigh scattering'

# Token Streaming
text = ""
async for response in client.generate_stream("Why is the sky blue?"):
    if not response.token.special:
        text += response.token.text

print(text)
# ' Rayleigh scattering'
```

Check all currently deployed models on the Huggingface Inference API with `Text Generation` support:

```python
from text_generation.inference_api import deployed_models

print(deployed_models())
```

### Hugging Face Inference Endpoint usage

```python
from text_generation import Client

endpoint_url = "https://YOUR_ENDPOINT.endpoints.huggingface.cloud"

client = Client(endpoint_url)
text = client.generate("Why is the sky blue?").generated_text
print(text)
# ' Rayleigh scattering'

# Token Streaming
text = ""
for response in client.generate_stream("Why is the sky blue?"):
    if not response.token.special:
        text += response.token.text

print(text)
# ' Rayleigh scattering'
```

or with the asynchronous client:

```python
from text_generation import AsyncClient

endpoint_url = "https://YOUR_ENDPOINT.endpoints.huggingface.cloud"

client = AsyncClient(endpoint_url)
response = await client.generate("Why is the sky blue?")
print(response.generated_text)
# ' Rayleigh scattering'

# Token Streaming
text = ""
async for response in client.generate_stream("Why is the sky blue?"):
    if not response.token.special:
        text += response.token.text

print(text)
# ' Rayleigh scattering'
```

### Types

```python
# enum for grammar type
class GrammarType(Enum):
    Json = "json"
    Regex = "regex"


# Grammar type and value
class Grammar:
    # Grammar type
    type: GrammarType
    # Grammar value
    value: Union[str, dict]

class Parameters:
    # Activate logits sampling
    do_sample: bool
    # Maximum number of generated tokens
    max_new_tokens: int
    # The parameter for repetition penalty. 1.0 means no penalty.
    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    repetition_penalty: Optional[float]
    # The parameter for frequency penalty. 1.0 means no penalty
    # Penalize new tokens based on their existing frequency in the text so far,
    # decreasing the model's likelihood to repeat the same line verbatim.
    frequency_penalty: Optional[float]
    # Whether to prepend the prompt to the generated text
    return_full_text: bool
    # Stop generating tokens if a member of `stop_sequences` is generated
    stop: List[str]
    # Random sampling seed
    seed: Optional[int]
    # The value used to module the logits distribution.
    temperature: Optional[float]
    # The number of highest probability vocabulary tokens to keep for top-k-filtering.
    top_k: Optional[int]
    # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
    # higher are kept for generation.
    top_p: Optional[float]
    # truncate inputs tokens to the given size
    truncate: Optional[int]
    # Typical Decoding mass
    # See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
    typical_p: Optional[float]
    # Generate best_of sequences and return the one if the highest token logprobs
    best_of: Optional[int]
    # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
    watermark: bool
    # Get generation details
    details: bool
    # Get decoder input token logprobs and ids
    decoder_input_details: bool
    # Return the N most likely tokens at each step
    top_n_tokens: Optional[int]
    # grammar to use for generation
    grammar: Optional[Grammar]

class Request:
    # Prompt
    inputs: str
    # Generation parameters
    parameters: Optional[Parameters]
    # Whether to stream output tokens
    stream: bool

# Decoder input tokens
class InputToken:
    # Token ID from the model tokenizer
    id: int
    # Token text
    text: str
    # Logprob
    # Optional since the logprob of the first token cannot be computed
    logprob: Optional[float]


# Generated tokens
class Token:
    # Token ID from the model tokenizer
    id: int
    # Token text
    text: str
    # Logprob
    logprob: Optional[float]
    # Is the token a special token
    # Can be used to ignore tokens when concatenating
    special: bool


# Generation finish reason
class FinishReason(Enum):
    # number of generated tokens == `max_new_tokens`
    Length = "length"
    # the model generated its end of sequence token
    EndOfSequenceToken = "eos_token"
    # the model generated a text included in `stop_sequences`
    StopSequence = "stop_sequence"


# Additional sequences when using the `best_of` parameter
class BestOfSequence:
    # Generated text
    generated_text: str
    # Generation finish reason
    finish_reason: FinishReason
    # Number of generated tokens
    generated_tokens: int
    # Sampling seed if sampling was activated
    seed: Optional[int]
    # Decoder input tokens, empty if decoder_input_details is False
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
    top_tokens: Optional[List[List[Token]]]


# `generate` details
class Details:
    # Generation finish reason
    finish_reason: FinishReason
    # Number of generated tokens
    generated_tokens: int
    # Sampling seed if sampling was activated
    seed: Optional[int]
    # Decoder input tokens, empty if decoder_input_details is False
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
    top_tokens: Optional[List[List[Token]]]
    # Additional sequences when using the `best_of` parameter
    best_of_sequences: Optional[List[BestOfSequence]]


# `generate` return value
class Response:
    # Generated text
    generated_text: str
    # Generation details
    details: Details


# `generate_stream` details
class StreamDetails:
    # Generation finish reason
    finish_reason: FinishReason
    # Number of generated tokens
    generated_tokens: int
    # Sampling seed if sampling was activated
    seed: Optional[int]


# `generate_stream` return value
class StreamResponse:
    # Generated token
    token: Token
    # Most likely tokens
    top_tokens: Optional[List[Token]]
    # Complete generated text
    # Only available when the generation is finished
    generated_text: Optional[str]
    # Generation details
    # Only available when the generation is finished
    details: Optional[StreamDetails]

# Inference API currently deployed model
class DeployedModel:
    model_id: str
    sha: str
```


================================================
FILE: clients/python/pyproject.toml
================================================
[tool.poetry]
name = "text-generation"
version = "0.7.0"
description = "Hugging Face Text Generation Python Client"
license = "Apache-2.0"
authors = ["Olivier Dehaene <olivier@huggingface.co>"]
maintainers = ["Olivier Dehaene <olivier@huggingface.co>"]
readme = "README.md"
homepage = "https://github.com/huggingface/text-generation-inference"
repository = "https://github.com/huggingface/text-generation-inference"


[tool.poetry.dependencies]
python = "^3.9"
pydantic = "> 2, < 3"
aiohttp = "^3.11"
huggingface-hub = ">= 0.12, < 1.0"

[tool.poetry.group.dev.dependencies]
pytest = "^8"
pytest-asyncio = "^0.26"
pytest-cov = "^6.0.0"

[tool.pytest.ini_options]
asyncio_mode = "auto"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.isort]
profile = "black"


================================================
FILE: clients/python/tests/conftest.py
================================================
import pytest

from text_generation import __version__
from huggingface_hub.utils import build_hf_headers


@pytest.fixture
def flan_t5_xxl():
    return "google/flan-t5-xxl"


@pytest.fixture
def llama_7b():
    return "meta-llama/Llama-2-7b-chat-hf"


@pytest.fixture
def fake_model():
    return "fake/model"


@pytest.fixture
def unsupported_model():
    return "google-bert/bert-base-uncased"


@pytest.fixture
def base_url():
    return "https://api-inference.huggingface.co/models"


@pytest.fixture
def bloom_url(base_url, bloom_model):
    return f"{base_url}/{bloom_model}"


@pytest.fixture
def flan_t5_xxl_url(base_url, flan_t5_xxl):
    return f"{base_url}/{flan_t5_xxl}"


@pytest.fixture
def llama_7b_url(base_url, llama_7b):
    return f"{base_url}/{llama_7b}"


@pytest.fixture
def fake_url(base_url, fake_model):
    return f"{base_url}/{fake_model}"


@pytest.fixture
def unsupported_url(base_url, unsupported_model):
    return f"{base_url}/{unsupported_model}"


@pytest.fixture(scope="session")
def hf_headers():
    return build_hf_headers(
        library_name="text-generation-tests", library_version=__version__
    )


================================================
FILE: clients/python/tests/test_client.py
================================================
import pytest

from text_generation import Client, AsyncClient
from text_generation.errors import NotFoundError, ValidationError
from text_generation.types import FinishReason


def test_generate(llama_7b_url, hf_headers):
    client = Client(llama_7b_url, hf_headers)
    response = client.generate("test", max_new_tokens=1, decoder_input_details=True)

    assert response.generated_text == "_"
    assert response.details.finish_reason == FinishReason.Length
    assert response.details.generated_tokens == 1
    assert response.details.seed is None
    assert len(response.details.prefill) == 0
    # assert response.details.prefill[0] == InputToken(id=1, text="<s>", logprob=None)
    assert len(response.details.tokens) == 1
    assert response.details.tokens[0].id == 29918
    assert response.details.tokens[0].text == "_"
    assert not response.details.tokens[0].special


def test_generate_best_of(llama_7b_url, hf_headers):
    client = Client(llama_7b_url, hf_headers)
    response = client.generate(
        "test", max_new_tokens=1, best_of=2, do_sample=True, decoder_input_details=True
    )

    assert response.details.seed is not None
    assert response.details.best_of_sequences is not None
    assert len(response.details.best_of_sequences) == 1
    assert response.details.best_of_sequences[0].seed is not None


def test_generate_not_found(fake_url, hf_headers):
    client = Client(fake_url, hf_headers)
    with pytest.raises(NotFoundError):
        client.generate("test")


def test_generate_validation_error(llama_7b_url, hf_headers):
    client = Client(llama_7b_url, hf_headers)
    with pytest.raises(ValidationError):
        client.generate("test", max_new_tokens=10_000)


def test_generate_stream(llama_7b_url, hf_headers):
    client = Client(llama_7b_url, hf_headers)
    responses = [
        response for response in client.generate_stream("test", max_new_tokens=1)
    ]

    assert len(responses) == 1
    response = responses[0]

    assert response.generated_text == "_"
    assert response.details.finish_reason == FinishReason.Length
    assert response.details.generated_tokens == 1
    assert response.details.seed is None


def test_generate_stream_not_found(fake_url, hf_headers):
    client = Client(fake_url, hf_headers)
    with pytest.raises(NotFoundError):
        list(client.generate_stream("test"))


def test_generate_stream_validation_error(llama_7b_url, hf_headers):
    client = Client(llama_7b_url, hf_headers)
    with pytest.raises(ValidationError):
        list(client.generate_stream("test", max_new_tokens=10_000))


@pytest.mark.asyncio
async def test_generate_async(llama_7b_url, hf_headers):
    client = AsyncClient(llama_7b_url, hf_headers)
    response = await client.generate(
        "test", max_new_tokens=1, decoder_input_details=True
    )

    assert response.generated_text == "_"
    assert response.details.finish_reason == FinishReason.Length
    assert response.details.generated_tokens == 1
    assert response.details.seed is None
    assert len(response.details.prefill) == 0
    # assert response.details.prefill[0] == InputToken(id=1, text="<s>", logprob=None)
    # assert response.details.prefill[1] == InputToken(
    #     id=1243, text="test", logprob=-10.96875
    # )
    assert len(response.details.tokens) == 1
    assert response.details.tokens[0].id == 29918
    assert response.details.tokens[0].text == "_"
    assert not response.details.tokens[0].special


@pytest.mark.asyncio
async def test_generate_async_best_of(llama_7b_url, hf_headers):
    client = AsyncClient(llama_7b_url, hf_headers)
    response = await client.generate(
        "test", max_new_tokens=1, best_of=2, do_sample=True, decoder_input_details=True
    )

    assert response.details.seed is not None
    assert response.details.best_of_sequences is not None
    assert len(response.details.best_of_sequences) == 1
    assert response.details.best_of_sequences[0].seed is not None


@pytest.mark.asyncio
async def test_generate_async_not_found(fake_url, hf_headers):
    client = AsyncClient(fake_url, hf_headers)
    with pytest.raises(NotFoundError):
        await client.generate("test")


@pytest.mark.asyncio
async def test_generate_async_validation_error(llama_7b_url, hf_headers):
    client = AsyncClient(llama_7b_url, hf_headers)
    with pytest.raises(ValidationError):
        await client.generate("test", max_new_tokens=10_000)


@pytest.mark.asyncio
async def test_generate_stream_async(llama_7b_url, hf_headers):
    client = AsyncClient(llama_7b_url, hf_headers)
    responses = [
        response async for response in client.generate_stream("test", max_new_tokens=1)
    ]

    assert len(responses) == 1
    response = responses[0]

    assert response.generated_text == "_"
    assert response.details.finish_reason == FinishReason.Length
    assert response.details.generated_tokens == 1
    assert response.details.seed is None


@pytest.mark.asyncio
async def test_generate_stream_async_not_found(fake_url, hf_headers):
    client = AsyncClient(fake_url, hf_headers)
    with pytest.raises(NotFoundError):
        async for _ in client.generate_stream("test"):
            pass


@pytest.mark.asyncio
async def test_generate_stream_async_validation_error(llama_7b_url, hf_headers):
    client = AsyncClient(llama_7b_url, hf_headers)
    with pytest.raises(ValidationError):
        async for _ in client.generate_stream("test", max_new_tokens=10_000):
            pass


================================================
FILE: clients/python/tests/test_errors.py
================================================
from text_generation.errors import (
    parse_error,
    GenerationError,
    IncompleteGenerationError,
    OverloadedError,
    ValidationError,
    BadRequestError,
    ShardNotReadyError,
    ShardTimeoutError,
    NotFoundError,
    RateLimitExceededError,
    UnknownError,
)


def test_generation_error():
    payload = {"error_type": "generation", "error": "test"}
    assert isinstance(parse_error(400, payload), GenerationError)


def test_incomplete_generation_error():
    payload = {"error_type": "incomplete_generation", "error": "test"}
    assert isinstance(parse_error(400, payload), IncompleteGenerationError)


def test_overloaded_error():
    payload = {"error_type": "overloaded", "error": "test"}
    assert isinstance(parse_error(400, payload), OverloadedError)


def test_validation_error():
    payload = {"error_type": "validation", "error": "test"}
    assert isinstance(parse_error(400, payload), ValidationError)


def test_bad_request_error():
    payload = {"error": "test"}
    assert isinstance(parse_error(400, payload), BadRequestError)


def test_shard_not_ready_error():
    payload = {"error": "test"}
    assert isinstance(parse_error(403, payload), ShardNotReadyError)
    assert isinstance(parse_error(424, payload), ShardNotReadyError)


def test_shard_timeout_error():
    payload = {"error": "test"}
    assert isinstance(parse_error(504, payload), ShardTimeoutError)


def test_not_found_error():
    payload = {"error": "test"}
    assert isinstance(parse_error(404, payload), NotFoundError)


def test_rate_limit_exceeded_error():
    payload = {"error": "test"}
    assert isinstance(parse_error(429, payload), RateLimitExceededError)


def test_unknown_error():
    payload = {"error": "test"}
    assert isinstance(parse_error(500, payload), UnknownError)


================================================
FILE: clients/python/tests/test_inference_api.py
================================================
# import pytest
#
# from text_generation import (
#     InferenceAPIClient,
#     InferenceAPIAsyncClient,
#     Client,
#     AsyncClient,
# )
# from text_generation.errors import NotSupportedError, NotFoundError
# from text_generation.inference_api import check_model_support, deployed_models
#
#
# def test_check_model_support(flan_t5_xxl, unsupported_model, fake_model):
#     assert check_model_support(flan_t5_xxl)
#     assert not check_model_support(unsupported_model)
#
#     with pytest.raises(NotFoundError):
#         check_model_support(fake_model)
#
#
# def test_deployed_models():
#     deployed_models()
#
#
# def test_client(flan_t5_xxl):
#     client = InferenceAPIClient(flan_t5_xxl)
#     assert isinstance(client, Client)
#
#
# def test_client_unsupported_model(unsupported_model):
#     with pytest.raises(NotSupportedError):
#         InferenceAPIClient(unsupported_model)
#
#
# def test_async_client(flan_t5_xxl):
#     client = InferenceAPIAsyncClient(flan_t5_xxl)
#     assert isinstance(client, AsyncClient)
#
#
# def test_async_client_unsupported_model(unsupported_model):
#     with pytest.raises(NotSupportedError):
#         InferenceAPIAsyncClient(unsupported_model)


================================================
FILE: clients/python/tests/test_types.py
================================================
import pytest

from text_generation.types import Parameters, Request
from text_generation.errors import ValidationError


def test_parameters_validation():
    # Test best_of
    Parameters(best_of=1)
    with pytest.raises(ValidationError):
        Parameters(best_of=0)
    with pytest.raises(ValidationError):
        Parameters(best_of=-1)
    Parameters(best_of=2, do_sample=True)
    with pytest.raises(ValidationError):
        Parameters(best_of=2)
    with pytest.raises(ValidationError):
        Parameters(best_of=2, seed=1)

    # Test repetition_penalty
    Parameters(repetition_penalty=1)
    with pytest.raises(ValidationError):
        Parameters(repetition_penalty=0)
    with pytest.raises(ValidationError):
        Parameters(repetition_penalty=-1)

    # Test seed
    Parameters(seed=1)
    with pytest.raises(ValidationError):
        Parameters(seed=-1)

    # Test temperature
    Parameters(temperature=1)
    with pytest.raises(ValidationError):
        Parameters(temperature=0)
    with pytest.raises(ValidationError):
        Parameters(temperature=-1)

    # Test top_k
    Parameters(top_k=1)
    with pytest.raises(ValidationError):
        Parameters(top_k=0)
    with pytest.raises(ValidationError):
        Parameters(top_k=-1)

    # Test top_p
    Parameters(top_p=0.5)
    with pytest.raises(ValidationError):
        Parameters(top_p=0)
    with pytest.raises(ValidationError):
        Parameters(top_p=-1)
    with pytest.raises(ValidationError):
        Parameters(top_p=1)

    # Test truncate
    Parameters(truncate=1)
    with pytest.raises(ValidationError):
        Parameters(truncate=0)
    with pytest.raises(ValidationError):
        Parameters(truncate=-1)

    # Test typical_p
    Parameters(typical_p=0.5)
    with pytest.raises(ValidationError):
        Parameters(typical_p=0)
    with pytest.raises(ValidationError):
        Parameters(typical_p=-1)
    with pytest.raises(ValidationError):
        Parameters(typical_p=1)


def test_request_validation():
    Request(inputs="test")

    with pytest.raises(ValidationError):
        Request(inputs="")

    Request(inputs="test", stream=True)
    Request(inputs="test", parameters=Parameters(best_of=2, do_sample=True))

    with pytest.raises(ValidationError):
        Request(
            inputs="test", parameters=Parameters(best_of=2, do_sample=True), stream=True
        )


================================================
FILE: clients/python/text_generation/__init__.py
================================================
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.7.0"

DEPRECATION_WARNING = (
    "`text_generation` clients are deprecated and will be removed in the near future. "
    "Please use the `InferenceClient` from the `huggingface_hub` package instead."
)

from text_generation.client import Client, AsyncClient  # noqa E402
from text_generation.inference_api import (  # noqa E402
    InferenceAPIClient,
    InferenceAPIAsyncClient,
)

__all__ = [
    "Client",
    "AsyncClient",
    "InferenceAPIClient",
    "InferenceAPIAsyncClient",
]


================================================
FILE: clients/python/text_generation/client.py
================================================
import json
import requests
import warnings

from aiohttp import ClientSession, ClientTimeout
from pydantic import ValidationError
from typing import Dict, Optional, List, AsyncIterator, Iterator, Union

from text_generation import DEPRECATION_WARNING
from text_generation.types import (
    StreamResponse,
    Response,
    Request,
    Parameters,
    Grammar,
    CompletionRequest,
    Completion,
    CompletionComplete,
    ChatRequest,
    ChatCompletionChunk,
    ChatComplete,
    Message,
    Tool,
)
from text_generation.errors import parse_error

# emit deprecation warnings
warnings.simplefilter("always", DeprecationWarning)


class Client:
    """Client to make calls to a text-generation-inference instance

     Example:

     ```python
     >>> from text_generation import Client

     >>> client = Client("https://api-inference.huggingface.co/models/bigscience/bloomz")
     >>> client.generate("Why is the sky blue?").generated_text
     ' Rayleigh scattering'

     >>> result = ""
     >>> for response in client.generate_stream("Why is the sky blue?"):
     >>>     if not response.token.special:
     >>>         result += response.token.text
     >>> result
    ' Rayleigh scattering'
     ```
    """

    def __init__(
        self,
        base_url: str,
        headers: Optional[Dict[str, str]] = None,
        cookies: Optional[Dict[str, str]] = None,
        timeout: int = 10,
    ):
        """
        Args:
            base_url (`str`):
                text-generation-inference instance base url
            headers (`Optional[Dict[str, str]]`):
                Additional headers
            cookies (`Optional[Dict[str, str]]`):
                Cookies to include in the requests
            timeout (`int`):
                Timeout in seconds
        """
        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
        self.base_url = base_url
        self.headers = headers
        self.cookies = cookies
        self.timeout = timeout

    def completion(
        self,
        prompt: str,
        frequency_penalty: Optional[float] = None,
        max_tokens: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stream: bool = False,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        stop: Optional[List[str]] = None,
    ):
        """
        Given a prompt, generate a response synchronously

        Args:
            prompt (`str`):
                Prompt
            frequency_penalty (`float`):
                The parameter for frequency penalty. 0.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            max_tokens (`int`):
                Maximum number of generated tokens
            repetition_penalty (`float`):
                The parameter for frequency penalty. 0.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            seed (`int`):
                Random sampling seed
            stream (`bool`):
                Stream the response
            temperature (`float`):
                The value used to module the logits distribution.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation
            stop (`List[str]`):
                Stop generating tokens if a member of `stop` is generated
        """
        request = CompletionRequest(
            model="tgi",
            prompt=prompt,
            frequency_penalty=frequency_penalty,
            max_tokens=max_tokens,
            repetition_penalty=repetition_penalty,
            seed=seed,
            stream=stream,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        )
        if not stream:
            resp = requests.post(
                f"{self.base_url}/v1/completions",
                json=request.dict(),
                headers=self.headers,
                cookies=self.cookies,
                timeout=self.timeout,
            )
            payload = resp.json()
            if resp.status_code != 200:
                raise parse_error(resp.status_code, payload)
            return Completion(**payload)
        else:
            return self._completion_stream_response(request)

    def _completion_stream_response(self, request):
        resp = requests.post(
            f"{self.base_url}/v1/completions",
            json=request.dict(),
            headers=self.headers,
            cookies=self.cookies,
            timeout=self.timeout,
            stream=True,
        )
        # iterate and print stream
        for byte_payload in resp.iter_lines():
            if byte_payload == b"\n":
                continue
            payload = byte_payload.decode("utf-8")
            if payload.startswith("data:"):
                json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
                try:
                    response = CompletionComplete(**json_payload)
                    yield response
                except ValidationError:
                    raise parse_error(resp.status, json_payload)

    def chat(
        self,
        messages: List[Message],
        repetition_penalty: Optional[float] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[List[float]] = None,
        logprobs: Optional[bool] = None,
        top_logprobs: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        stream: bool = False,
        seed: Optional[int] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        tools: Optional[List[Tool]] = None,
        tool_prompt: Optional[str] = None,
        tool_choice: Optional[str] = None,
        stop: Optional[List[str]] = None,
    ):
        """
        Given a list of messages, generate a response asynchronously

        Args:
            messages (`List[Message]`):
                List of messages
            repetition_penalty (`float`):
                The parameter for repetition penalty. 0.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            frequency_penalty (`float`):
                The parameter for frequency penalty. 0.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            logit_bias (`List[float]`):
                Adjust the likelihood of specified tokens
            logprobs (`bool`):
                Include log probabilities in the response
            top_logprobs (`int`):
                Include the `n` most likely tokens at each step
            max_tokens (`int`):
                Maximum number of generated tokens
            n (`int`):
                Generate `n` completions
            presence_penalty (`float`):
                The parameter for presence penalty. 0.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            stream (`bool`):
                Stream the response
            seed (`int`):
                Random sampling seed
            temperature (`float`):
                The value used to module the logits distribution.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation
            tools (`List[Tool]`):
                List of tools to use
            tool_prompt (`str`):
                A prompt to be appended before the tools
            tool_choice (`str`):
                The tool to use
            stop (`List[str]`):
                Stop generating tokens if a member of `stop` is generated

        """
        request = ChatRequest(
            model="tgi",
            messages=messages,
            repetition_penalty=repetition_penalty,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            stream=stream,
            seed=seed,
            temperature=temperature,
            top_p=top_p,
            tools=tools,
            tool_prompt=tool_prompt,
            tool_choice=tool_choice,
            stop=stop,
        )
        if not stream:
            resp = requests.post(
                f"{self.base_url}/v1/chat/completions",
                json=request.dict(),
                headers=self.headers,
                cookies=self.cookies,
                timeout=self.timeout,
            )
            payload = resp.json()
            if resp.status_code != 200:
                raise parse_error(resp.status_code, payload)
            return ChatComplete(**payload)
        else:
            return self._chat_stream_response(request)

    def _chat_stream_response(self, request):
        resp = requests.post(
            f"{self.base_url}/v1/chat/completions",
            json=request.dict(),
            headers=self.headers,
            cookies=self.cookies,
            timeout=self.timeout,
            stream=True,
        )
        # iterate and print stream
        for byte_payload in resp.iter_lines():
            if byte_payload == b"\n":
                continue
            payload = byte_payload.decode("utf-8")
            if payload.startswith("data:"):
                json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
                try:
                    response = ChatCompletionChunk(**json_payload)
                    yield response
                except ValidationError:
                    raise parse_error(resp.status, json_payload)

    def generate(
        self,
        prompt: str,
        do_sample: bool = False,
        max_new_tokens: int = 20,
        best_of: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
        frequency_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        truncate: Optional[int] = None,
        typical_p: Optional[float] = None,
        watermark: bool = False,
        decoder_input_details: bool = False,
        top_n_tokens: Optional[int] = None,
        grammar: Optional[Grammar] = None,
    ) -> Response:
        """
        Given a prompt, generate the following text

        Args:
            prompt (`str`):
                Input text
            do_sample (`bool`):
                Activate logits sampling
            max_new_tokens (`int`):
                Maximum number of generated tokens
            best_of (`int`):
                Generate best_of sequences and return the one if the highest token logprobs
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            frequency_penalty (`float`):
                The parameter for frequency penalty. 1.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
                Random sampling seed
            stop_sequences (`List[str]`):
                Stop generating tokens if a member of `stop_sequences` is generated
            temperature (`float`):
                The value used to module the logits distribution.
            top_k (`int`):
                The number of highest probability vocabulary tokens to keep for top-k-filtering.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation.
            truncate (`int`):
                Truncate inputs tokens to the given size
            typical_p (`float`):
                Typical Decoding mass
                See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
            watermark (`bool`):
                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
            decoder_input_details (`bool`):
                Return the decoder input token logprobs and ids
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step
            grammar (`Grammar`):
                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
                of the text to match a regular expression or JSON schema.

        Returns:
            Response: generated response
        """
        # Validate parameters
        parameters = Parameters(
            best_of=best_of,
            details=True,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
            frequency_penalty=frequency_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            truncate=truncate,
            typical_p=typical_p,
            watermark=watermark,
            decoder_input_details=decoder_input_details,
            top_n_tokens=top_n_tokens,
            grammar=grammar,
        )
        request = Request(inputs=prompt, stream=False, parameters=parameters)

        resp = requests.post(
            self.base_url,
            json=request.dict(),
            headers=self.headers,
            cookies=self.cookies,
            timeout=self.timeout,
        )
        payload = resp.json()
        if resp.status_code != 200:
            raise parse_error(resp.status_code, payload)
        return Response(**payload[0])

    def generate_stream(
        self,
        prompt: str,
        do_sample: bool = False,
        max_new_tokens: int = 20,
        repetition_penalty: Optional[float] = None,
        frequency_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        truncate: Optional[int] = None,
        typical_p: Optional[float] = None,
        watermark: bool = False,
        top_n_tokens: Optional[int] = None,
        grammar: Optional[Grammar] = None,
    ) -> Iterator[StreamResponse]:
        """
        Given a prompt, generate the following stream of tokens

        Args:
            prompt (`str`):
                Input text
            do_sample (`bool`):
                Activate logits sampling
            max_new_tokens (`int`):
                Maximum number of generated tokens
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            frequency_penalty (`float`):
                The parameter for frequency penalty. 1.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
                Random sampling seed
            stop_sequences (`List[str]`):
                Stop generating tokens if a member of `stop_sequences` is generated
            temperature (`float`):
                The value used to module the logits distribution.
            top_k (`int`):
                The number of highest probability vocabulary tokens to keep for top-k-filtering.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation.
            truncate (`int`):
                Truncate inputs tokens to the given size
            typical_p (`float`):
                Typical Decoding mass
                See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
            watermark (`bool`):
                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step
            grammar (`Grammar`):
                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
                of the text to match a regular expression or JSON schema.

        Returns:
            Iterator[StreamResponse]: stream of generated tokens
        """
        # Validate parameters
        parameters = Parameters(
            best_of=None,
            details=True,
            decoder_input_details=False,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
            frequency_penalty=frequency_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            truncate=truncate,
            typical_p=typical_p,
            watermark=watermark,
            top_n_tokens=top_n_tokens,
            grammar=grammar,
        )
        request = Request(inputs=prompt, stream=True, parameters=parameters)

        resp = requests.post(
            self.base_url,
            json=request.dict(),
            headers=self.headers,
            cookies=self.cookies,
            timeout=self.timeout,
            stream=True,
        )

        if resp.status_code != 200:
            raise parse_error(resp.status_code, resp.json())

        # Parse ServerSentEvents
        for byte_payload in resp.iter_lines():
            # Skip line
            if byte_payload == b"\n":
                continue

            payload = byte_payload.decode("utf-8")

            # Event data
            if payload.startswith("data:"):
                # Decode payload
                json_payload = json.loads(payload.lstrip("data:").rstrip("/n"))
                # Parse payload
                try:
                    response = StreamResponse(**json_payload)
                except ValidationError:
                    # If we failed to parse the payload, then it is an error payload
                    raise parse_error(resp.status_code, json_payload)
                yield response


class AsyncClient:
    """Asynchronous Client to make calls to a text-generation-inference instance

     Example:

     ```python
     >>> from text_generation import AsyncClient

     >>> client = AsyncClient("https://api-inference.huggingface.co/models/bigscience/bloomz")
     >>> response = await client.generate("Why is the sky blue?")
     >>> response.generated_text
     ' Rayleigh scattering'

     >>> result = ""
     >>> async for response in client.generate_stream("Why is the sky blue?"):
     >>>     if not response.token.special:
     >>>         result += response.token.text
     >>> result
    ' Rayleigh scattering'
     ```
    """

    def __init__(
        self,
        base_url: str,
        headers: Optional[Dict[str, str]] = None,
        cookies: Optional[Dict[str, str]] = None,
        timeout: int = 10,
    ):
        """
        Args:
            base_url (`str`):
                text-generation-inference instance base url
            headers (`Optional[Dict[str, str]]`):
                Additional headers
            cookies (`Optional[Dict[str, str]]`):
                Cookies to include in the requests
            timeout (`int`):
                Timeout in seconds
        """
        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
        self.base_url = base_url
        self.headers = headers
        self.cookies = cookies
        self.timeout = ClientTimeout(timeout)

    async def completion(
        self,
        prompt: str,
        frequency_penalty: Optional[float] = None,
        max_tokens: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stream: bool = False,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        stop: Optional[List[str]] = None,
    ) -> Union[Completion, AsyncIterator[CompletionComplete]]:
        """
        Given a prompt, generate a response asynchronously

        Args:
            prompt (`str`):
                Prompt
            frequency_penalty (`float`):
                The parameter for frequency penalty. 0.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            max_tokens (`int`):
                Maximum number of generated tokens
            repetition_penalty (`float`):
                The parameter for frequency penalty. 0.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            seed (`int`):
                Random sampling seed
            stream (`bool`):
                Stream the response
            temperature (`float`):
                The value used to module the logits distribution.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation
            stop (`List[str]`):
                Stop generating tokens if a member of `stop` is generated
        """
        request = CompletionRequest(
            model="tgi",
            prompt=prompt,
            frequency_penalty=frequency_penalty,
            max_tokens=max_tokens,
            repetition_penalty=repetition_penalty,
            seed=seed,
            stream=stream,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        )
        if not stream:
            return await self._completion_single_response(request)
        else:
            return self._completion_stream_response(request)

    async def _completion_single_response(self, request):
        async with ClientSession(
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        ) as session:
            async with session.post(
                f"{self.base_url}/v1/completions", json=request.dict()
            ) as resp:
                payload = await resp.json()
                if resp.status != 200:
                    raise parse_error(resp.status, payload)
                return Completion(**payload)

    async def _completion_stream_response(self, request):
        async with ClientSession(
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        ) as session:
            async with session.post(
                f"{self.base_url}/v1/completions", json=request.dict()
            ) as resp:
                async for byte_payload in resp.content:
                    if byte_payload == b"\n":
                        continue
                    payload = byte_payload.decode("utf-8")
                    if payload.startswith("data:"):
                        json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
                        try:
                            response = CompletionComplete(**json_payload)
                            yield response
                        except ValidationError:
                            raise parse_error(resp.status, json_payload)

    async def chat(
        self,
        messages: List[Message],
        repetition_penalty: Optional[float] = None,
        frequency_penalty: Optional[float] = None,
        logit_bias: Optional[List[float]] = None,
        logprobs: Optional[bool] = None,
        top_logprobs: Optional[int] = None,
        max_tokens: Optional[int] = None,
        n: Optional[int] = None,
        presence_penalty: Optional[float] = None,
        stream: bool = False,
        seed: Optional[int] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        tools: Optional[List[Tool]] = None,
        tool_prompt: Optional[str] = None,
        tool_choice: Optional[str] = None,
        stop: Optional[List[str]] = None,
    ) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]:
        """
        Given a list of messages, generate a response asynchronously

        Args:
            messages (`List[Message]`):
                List of messages
            repetition_penalty (`float`):
                The parameter for frequency penalty. 0.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            frequency_penalty (`float`):
                The parameter for frequency penalty. 0.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            logit_bias (`List[float]`):
                Adjust the likelihood of specified tokens
            logprobs (`bool`):
                Include log probabilities in the response
            top_logprobs (`int`):
                Include the `n` most likely tokens at each step
            max_tokens (`int`):
                Maximum number of generated tokens
            n (`int`):
                Generate `n` completions
            presence_penalty (`float`):
                The parameter for presence penalty. 0.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            stream (`bool`):
                Stream the response
            seed (`int`):
                Random sampling seed
            temperature (`float`):
                The value used to module the logits distribution.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation
            tools (`List[Tool]`):
                List of tools to use
            tool_prompt (`str`):
                A prompt to be appended before the tools
            tool_choice (`str`):
                The tool to use
            stop (`List[str]`):
                Stop generating tokens if a member of `stop` is generated

        """
        request = ChatRequest(
            model="tgi",
            messages=messages,
            repetition_penalty=repetition_penalty,
            frequency_penalty=frequency_penalty,
            logit_bias=logit_bias,
            logprobs=logprobs,
            top_logprobs=top_logprobs,
            max_tokens=max_tokens,
            n=n,
            presence_penalty=presence_penalty,
            stream=stream,
            seed=seed,
            temperature=temperature,
            top_p=top_p,
            tools=tools,
            tool_prompt=tool_prompt,
            tool_choice=tool_choice,
            stop=stop,
        )
        if not stream:
            return await self._chat_single_response(request)
        else:
            return self._chat_stream_response(request)

    async def _chat_single_response(self, request):
        async with ClientSession(
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        ) as session:
            async with session.post(
                f"{self.base_url}/v1/chat/completions", json=request.dict()
            ) as resp:
                payload = await resp.json()
                if resp.status != 200:
                    raise parse_error(resp.status, payload)
                return ChatComplete(**payload)

    async def _chat_stream_response(self, request):
        async with ClientSession(
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        ) as session:
            async with session.post(
                f"{self.base_url}/v1/chat/completions", json=request.dict()
            ) as resp:
                async for byte_payload in resp.content:
                    if byte_payload == b"\n":
                        continue
                    payload = byte_payload.decode("utf-8")
                    if payload.startswith("data:"):
                        payload_data = (
                            payload.lstrip("data:").rstrip("\n").removeprefix(" ")
                        )
                        if payload_data == "[DONE]":
                            break
                        json_payload = json.loads(payload_data)
                        try:
                            response = ChatCompletionChunk(**json_payload)
                            yield response
                        except ValidationError:
                            raise parse_error(resp.status, json_payload)

    async def generate(
        self,
        prompt: str,
        do_sample: bool = False,
        max_new_tokens: int = 20,
        best_of: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
        frequency_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        truncate: Optional[int] = None,
        typical_p: Optional[float] = None,
        watermark: bool = False,
        decoder_input_details: bool = False,
        top_n_tokens: Optional[int] = None,
        grammar: Optional[Grammar] = None,
    ) -> Response:
        """
        Given a prompt, generate the following text asynchronously

        Args:
            prompt (`str`):
                Input text
            do_sample (`bool`):
                Activate logits sampling
            max_new_tokens (`int`):
                Maximum number of generated tokens
            best_of (`int`):
                Generate best_of sequences and return the one if the highest token logprobs
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            frequency_penalty (`float`):
                The parameter for frequency penalty. 1.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
                Random sampling seed
            stop_sequences (`List[str]`):
                Stop generating tokens if a member of `stop_sequences` is generated
            temperature (`float`):
                The value used to module the logits distribution.
            top_k (`int`):
                The number of highest probability vocabulary tokens to keep for top-k-filtering.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation.
            truncate (`int`):
                Truncate inputs tokens to the given size
            typical_p (`float`):
                Typical Decoding mass
                See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
            watermark (`bool`):
                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
            decoder_input_details (`bool`):
                Return the decoder input token logprobs and ids
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step
            grammar (`Grammar`):
                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
                of the text to match a regular expression or JSON schema.

        Returns:
            Response: generated response
        """

        # Validate parameters
        parameters = Parameters(
            best_of=best_of,
            details=True,
            decoder_input_details=decoder_input_details,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
            frequency_penalty=frequency_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            truncate=truncate,
            typical_p=typical_p,
            watermark=watermark,
            top_n_tokens=top_n_tokens,
            grammar=grammar,
        )
        request = Request(inputs=prompt, stream=False, parameters=parameters)

        async with ClientSession(
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        ) as session:
            async with session.post(self.base_url, json=request.model_dump()) as resp:
                payload = await resp.json()

                if resp.status != 200:
                    raise parse_error(resp.status, payload)
                return Response(**payload[0])

    async def generate_stream(
        self,
        prompt: str,
        do_sample: bool = False,
        max_new_tokens: int = 20,
        repetition_penalty: Optional[float] = None,
        frequency_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        truncate: Optional[int] = None,
        typical_p: Optional[float] = None,
        watermark: bool = False,
        top_n_tokens: Optional[int] = None,
        grammar: Optional[Grammar] = None,
    ) -> AsyncIterator[StreamResponse]:
        """
        Given a prompt, generate the following stream of tokens asynchronously

        Args:
            prompt (`str`):
                Input text
            do_sample (`bool`):
                Activate logits sampling
            max_new_tokens (`int`):
                Maximum number of generated tokens
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            frequency_penalty (`float`):
                The parameter for frequency penalty. 1.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
                Random sampling seed
            stop_sequences (`List[str]`):
                Stop generating tokens if a member of `stop_sequences` is generated
            temperature (`float`):
                The value used to module the logits distribution.
            top_k (`int`):
                The number of highest probability vocabulary tokens to keep for top-k-filtering.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation.
            truncate (`int`):
                Truncate inputs tokens to the given size
            typical_p (`float`):
                Typical Decoding mass
                See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
            watermark (`bool`):
                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step
            grammar (`Grammar`):
                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
                of the text to match a regular expression or JSON schema.

        Returns:
            AsyncIterator[StreamResponse]: stream of generated tokens
        """
        # Validate parameters
        parameters = Parameters(
            best_of=None,
            details=True,
            decoder_input_details=False,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
            frequency_penalty=frequency_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            truncate=truncate,
            typical_p=typical_p,
            watermark=watermark,
            top_n_tokens=top_n_tokens,
            grammar=grammar,
        )
        request = Request(inputs=prompt, stream=True, parameters=parameters)

        async with ClientSession(
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        ) as session:
            async with session.post(self.base_url, json=request.dict()) as resp:
                if resp.status != 200:
                    raise parse_error(resp.status, await resp.json())

                # Parse ServerSentEvents
                async for byte_payload in resp.content:
                    # Skip line
                    if byte_payload == b"\n":
                        continue

                    payload = byte_payload.decode("utf-8")

                    # Event data
                    if payload.startswith("data:"):
                        # Decode payload
                        json_payload = json.loads(payload.lstrip("data:").rstrip("/n"))
                        # Parse payload
                        try:
                            response = StreamResponse(**json_payload)
                        except ValidationError:
                            # If we failed to parse the payload, then it is an error payload
                            raise parse_error(resp.status, json_payload)
                        yield response


================================================
FILE: clients/python/text_generation/errors.py
================================================
from typing import Dict


# Text Generation Inference Errors
class ValidationError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class GenerationError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class OverloadedError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class IncompleteGenerationError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


# API Inference Errors
class BadRequestError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class ShardNotReadyError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class ShardTimeoutError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class NotFoundError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class RateLimitExceededError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


class NotSupportedError(Exception):
    def __init__(self, model_id: str):
        message = (
            f"Model `{model_id}` is not available for inference with this client. \n"
            "Use `huggingface_hub.inference_api.InferenceApi` instead."
        )
        super(NotSupportedError, self).__init__(message)


# Unknown error
class UnknownError(Exception):
    def __init__(self, message: str):
        super().__init__(message)


def parse_error(status_code: int, payload: Dict[str, str]) -> Exception:
    """
    Parse error given an HTTP status code and a json payload

    Args:
        status_code (`int`):
            HTTP status code
        payload (`Dict[str, str]`):
            Json payload

    Returns:
        Exception: parsed exception

    """
    # Try to parse a Text Generation Inference error
    message = payload["error"]
    if "error_type" in payload:
        error_type = payload["error_type"]
        if error_type == "generation":
            return GenerationError(message)
        if error_type == "incomplete_generation":
            return IncompleteGenerationError(message)
        if error_type == "overloaded":
            return OverloadedError(message)
        if error_type == "validation":
            return ValidationError(message)

    # Try to parse a APIInference error
    if status_code == 400:
        return BadRequestError(message)
    if status_code == 403 or status_code == 424:
        return ShardNotReadyError(message)
    if status_code == 504:
        return ShardTimeoutError(message)
    if status_code == 404:
        return NotFoundError(message)
    if status_code == 429:
        return RateLimitExceededError(message)

    # Fallback to an unknown error
    return UnknownError(message)


================================================
FILE: clients/python/text_generation/inference_api.py
================================================
import os
import requests

from typing import Dict, Optional, List
from huggingface_hub.utils import build_hf_headers

from text_generation import Client, AsyncClient, __version__
from text_generation.types import DeployedModel
from text_generation.errors import NotSupportedError, parse_error

INFERENCE_ENDPOINT = os.environ.get(
    "HF_INFERENCE_ENDPOINT", "https://api-inference.huggingface.co"
)


def deployed_models(headers: Optional[Dict] = None) -> List[DeployedModel]:
    """
    Get all currently deployed models with text-generation-inference-support

    Returns:
        List[DeployedModel]: list of all currently deployed models
    """
    resp = requests.get(
        "https://api-inference.huggingface.co/framework/text-generation-inference",
        headers=headers,
        timeout=5,
    )

    payload = resp.json()
    if resp.status_code != 200:
        raise parse_error(resp.status_code, payload)

    models = [DeployedModel(**raw_deployed_model) for raw_deployed_model in payload]
    return models


def check_model_support(repo_id: str, headers: Optional[Dict] = None) -> bool:
    """
    Check if a given model is supported by text-generation-inference

    Returns:
        bool: whether the model is supported by this client
    """
    resp = requests.get(
        f"https://api-inference.huggingface.co/status/{repo_id}",
        headers=headers,
        timeout=5,
    )

    payload = resp.json()
    if resp.status_code != 200:
        raise parse_error(resp.status_code, payload)

    framework = payload["framework"]
    supported = framework == "text-generation-inference"
    return supported


class InferenceAPIClient(Client):
    """Client to make calls to the HuggingFace Inference API.

     Only supports a subset of the available text-generation or text2text-generation models that are served using
     text-generation-inference

     Example:

     ```python
     >>> from text_generation import InferenceAPIClient

     >>> client = InferenceAPIClient("bigscience/bloomz")
     >>> client.generate("Why is the sky blue?").generated_text
     ' Rayleigh scattering'

     >>> result = ""
     >>> for response in client.generate_stream("Why is the sky blue?"):
     >>>     if not response.token.special:
     >>>         result += response.token.text
     >>> result
    ' Rayleigh scattering'
     ```
    """

    def __init__(self, repo_id: str, token: Optional[str] = None, timeout: int = 10):
        """
        Init headers and API information

        Args:
            repo_id (`str`):
                Id of repository (e.g. `bigscience/bloom`).
            token (`str`, `optional`):
                The API token to use as HTTP bearer authorization. This is not
                the authentication token. You can find the token in
                https://huggingface.co/settings/token. Alternatively, you can
                find both your organizations and personal API tokens using
                `HfApi().whoami(token)`.
            timeout (`int`):
                Timeout in seconds
        """

        headers = build_hf_headers(
            token=token, library_name="text-generation", library_version=__version__
        )

        # Text Generation Inference client only supports a subset of the available hub models
        if not check_model_support(repo_id, headers):
            raise NotSupportedError(repo_id)

        base_url = f"{INFERENCE_ENDPOINT}/models/{repo_id}"

        super(InferenceAPIClient, self).__init__(
            base_url, headers=headers, timeout=timeout
        )


class InferenceAPIAsyncClient(AsyncClient):
    """Aynschronous Client to make calls to the HuggingFace Inference API.

     Only supports a subset of the available text-generation or text2text-generation models that are served using
     text-generation-inference

     Example:

     ```python
     >>> from text_generation import InferenceAPIAsyncClient

     >>> client = InferenceAPIAsyncClient("bigscience/bloomz")
     >>> response = await client.generate("Why is the sky blue?")
     >>> response.generated_text
     ' Rayleigh scattering'

     >>> result = ""
     >>> async for response in client.generate_stream("Why is the sky blue?"):
     >>>     if not response.token.special:
     >>>         result += response.token.text
     >>> result
    ' Rayleigh scattering'
     ```
    """

    def __init__(self, repo_id: str, token: Optional[str] = None, timeout: int = 10):
        """
        Init headers and API information

        Args:
            repo_id (`str`):
                Id of repository (e.g. `bigscience/bloom`).
            token (`str`, `optional`):
                The API token to use as HTTP bearer authorization. This is not
                the authentication token. You can find the token in
                https://huggingface.co/settings/token. Alternatively, you can
                find both your organizations and personal API tokens using
                `HfApi().whoami(token)`.
            timeout (`int`):
                Timeout in seconds
        """
        headers = build_hf_headers(
            token=token, library_name="text-generation", library_version=__version__
        )

        # Text Generation Inference client only supports a subset of the available hub models
        if not check_model_support(repo_id, headers):
            raise NotSupportedError(repo_id)

        base_url = f"{INFERENCE_ENDPOINT}/models/{repo_id}"

        super(InferenceAPIAsyncClient, self).__init__(
            base_url, headers=headers, timeout=timeout
        )


================================================
FILE: clients/python/text_generation/types.py
================================================
from enum import Enum
from pydantic import BaseModel, field_validator, ConfigDict
from typing import Optional, List, Union, Any

from text_generation.errors import ValidationError


# enum for grammar type
class GrammarType(str, Enum):
    Json = "json"
    Regex = "regex"


# Grammar type and value
class Grammar(BaseModel):
    # Grammar type
    type: GrammarType
    # Grammar value
    value: Union[str, dict]


class ToolCall(BaseModel):
    # Id of the tool call
    id: int
    # Type of the tool call
    type: str
    # Function details of the tool call
    function: dict


class Chunk(BaseModel):
    type: str
    text: Optional[str] = None
    image_url: Any = None


class Message(BaseModel):
    # Role of the message sender
    role: str
    # Content of the message
    content: Optional[Union[str, List[Chunk]]] = None
    # Optional name of the message sender
    name: Optional[str] = None
    # Tool calls associated with the chat completion
    tool_calls: Optional[Any] = None


class Tool(BaseModel):
    # Type of the tool
    type: str
    # Function details of the tool
    function: dict


class Function(BaseModel):
    name: Optional[str]
    arguments: str


class ChoiceDeltaToolCall(BaseModel):
    index: int
    id: str
    type: str
    function: Function


class ChoiceDelta(BaseModel):
    role: str
    content: Optional[str] = None
    tool_calls: Optional[List[ChoiceDeltaToolCall]] = None


class Choice(BaseModel):
    index: int
    delta: ChoiceDelta
    logprobs: Optional[dict] = None
    finish_reason: Optional[str] = None


class CompletionRequest(BaseModel):
    # Model identifier
    model: str
    # Prompt
    prompt: str
    # The parameter for repetition penalty. 1.0 means no penalty.
    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    repetition_penalty: Optional[float] = None
    # The parameter for frequency penalty. 1.0 means no penalty
    # Penalize new tokens based on their existing frequency in the text so far,
    # decreasing the model's likelihood to repeat the same line verbatim.
    frequency_penalty: Optional[float] = None
    # Maximum number of tokens to generate
    max_tokens: Optional[int] = None
    # Flag to indicate streaming response
    stream: bool = False
    # Random sampling seed
    seed: Optional[int] = None
    # Sampling temperature
    temperature: Optional[float] = None
    # Top-p value for nucleus sampling
    top_p: Optional[float] = None
    # Stop generating tokens if a member of `stop` is generated
    stop: Optional[List[str]] = None


class CompletionComplete(BaseModel):
    # Index of the chat completion
    index: int
    # Message associated with the chat completion
    text: str
    # Log probabilities for the chat completion
    logprobs: Optional[Any]
    # Reason for completion
    finish_reason: str


class Completion(BaseModel):
    # Completion details
    id: str
    object: str
    created: int
    model: str
    system_fingerprint: str
    choices: List[CompletionComplete]


class ChatRequest(BaseModel):
    # Model identifier
    model: str
    # List of messages in the conversation
    messages: List[Message]
    # The parameter for repetition penalty. 1.0 means no penalty.
    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    repetition_penalty: Optional[float] = None
    # The parameter for frequency penalty. 1.0 means no penalty
    # Penalize new tokens based on their existing frequency in the text so far,
    # decreasing the model's likelihood to repeat the same line verbatim.
    frequency_penalty: Optional[float] = None
    # Bias values for token selection
    logit_bias: Optional[List[float]] = None
    # Whether to return log probabilities
    logprobs: Optional[bool] = None
    # Number of most likely tokens to return at each position
    top_logprobs: Optional[int] = None
    # Maximum number of tokens to generate
    max_tokens: Optional[int] = None
    # Number of chat completion choices to generate
    n: Optional[int] = None
    # Penalty for presence of new tokens
    presence_penalty: Optional[float] = None
    # Flag to indicate streaming response
    stream: bool = False
    # Random sampling seed
    seed: Optional[int] = None
    # Sampling temperature
    temperature: Optional[float] = None
    # Top-p value for nucleus sampling
    top_p: Optional[float] = None
    # List of tools to be used
    tools: Optional[List[Tool]] = None
    # A prompt to be appended before the tools
    tool_prompt: Optional[str] = None
    # Choice of tool to be used
    tool_choice: Optional[str] = None
    # Stop generating tokens if a member of `stop` is generated
    stop: Optional[List[str]] = None


class ChatCompletionComplete(BaseModel):
    # Index of the chat completion
    index: int
    # Message associated with the chat completion
    message: Message
    # Log probabilities for the chat completion
    logprobs: Optional[Any]
    # Reason for completion
    finish_reason: Optional[str]
    # Usage details of the chat completion
    usage: Optional[Any] = None


class ChatComplete(BaseModel):
    # Chat completion details
    id: str
    object: str
    created: int
    model: str
    system_fingerprint: str
    choices: List[ChatCompletionComplete]
    usage: Any


class ChatCompletionChunk(BaseModel):
    id: str
    object: str
    created: int
    model: str
    system_fingerprint: str
    choices: List[Choice]
    usage: Optional[Any] = None


class Parameters(BaseModel):
    # Activate logits sampling
    do_sample: bool = False
    # Maximum number of generated tokens
    max_new_tokens: int = 20
    # The parameter for repetition penalty. 1.0 means no penalty.
    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    repetition_penalty: Optional[float] = None
    # The parameter for frequency penalty. 1.0 means no penalty
    # Penalize new tokens based on their existing frequency in the text so far,
    # decreasing the model's likelihood to repeat the same line verbatim.
    frequency_penalty: Optional[float] = None
    # Whether to prepend the prompt to the generated text
    return_full_text: bool = False
    # Stop generating tokens if a member of `stop_sequences` is generated
    stop: List[str] = []
    # Random sampling seed
    seed: Optional[int] = None
    # The value used to module the logits distribution.
    temperature: Optional[float] = None
    # The number of highest probability vocabulary tokens to keep for top-k-filtering.
    top_k: Optional[int] = None
    # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
    # higher are kept for generation.
    top_p: Optional[float] = None
    # truncate inputs tokens to the given size
    truncate: Optional[int] = None
    # Typical Decoding mass
    # See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
    typical_p: Optional[float] = None
    # Generate best_of sequences and return the one if the highest token logprobs
    best_of: Optional[int] = None
    # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
    watermark: bool = False
    # Get generation details
    details: bool = False
    # Get decoder input token logprobs and ids
    decoder_input_details: bool = False
    # Return the N most likely tokens at each step
    top_n_tokens: Optional[int] = None
    # grammar to use for generation
    grammar: Optional[Grammar] = None

    @field_validator("best_of")
    def valid_best_of(cls, field_value, values):
        if field_value is not None:
            if field_value <= 0:
                raise ValidationError("`best_of` must be strictly positive")
            if field_value > 1 and values.data["seed"] is not None:
                raise ValidationError("`seed` must not be set when `best_of` is > 1")
            sampling = (
                values.data["do_sample"]
                | (values.data["temperature"] is not None)
                | (values.data["top_k"] is not None)
                | (values.data["top_p"] is not None)
                | (values.data["typical_p"] is not None)
            )
            if field_value > 1 and not sampling:
                raise ValidationError("you must use sampling when `best_of` is > 1")

        return field_value

    @field_validator("repetition_penalty")
    def valid_repetition_penalty(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`repetition_penalty` must be strictly positive")
        return v

    @field_validator("frequency_penalty")
    def valid_frequency_penalty(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`frequency_penalty` must be strictly positive")
        return v

    @field_validator("seed")
    def valid_seed(cls, v):
        if v is not None and v < 0:
            raise ValidationError("`seed` must be positive")
        return v

    @field_validator("temperature")
    def valid_temp(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`temperature` must be strictly positive")
        return v

    @field_validator("top_k")
    def valid_top_k(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`top_k` must be strictly positive")
        return v

    @field_validator("top_p")
    def valid_top_p(cls, v):
        if v is not None and (v <= 0 or v >= 1.0):
            raise ValidationError("`top_p` must be > 0.0 and < 1.0")
        return v

    @field_validator("truncate")
    def valid_truncate(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`truncate` must be strictly positive")
        return v

    @field_validator("typical_p")
    def valid_typical_p(cls, v):
        if v is not None and (v <= 0 or v >= 1.0):
            raise ValidationError("`typical_p` must be > 0.0 and < 1.0")
        return v

    @field_validator("top_n_tokens")
    def valid_top_n_tokens(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`top_n_tokens` must be strictly positive")
        return v

    @field_validator("grammar")
    def valid_grammar(cls, v):
        if v is not None:
            if v.type == GrammarType.Regex and not v.value:
                raise ValidationError("`value` cannot be empty for `regex` grammar")
            if v.type == GrammarType.Json and not v.value:
                raise ValidationError("`value` cannot be empty for `json` grammar")
        return v


class Request(BaseModel):
    # Prompt
    inputs: str
    # Generation parameters
    parameters: Optional[Parameters] = None
    # Whether to stream output tokens
    stream: bool = False

    @field_validator("inputs")
    def valid_input(cls, v):
        if not v:
            raise ValidationError("`inputs` cannot be empty")
        return v

    @field_validator("stream")
    def valid_best_of_stream(cls, field_value, values):
        parameters = values.data["parameters"]
        if (
            parameters is not None
            and parameters.best_of is not None
            and parameters.best_of > 1
            and field_value
        ):
            raise ValidationError(
                "`best_of` != 1 is not supported when `stream` == True"
            )
        return field_value


# Decoder input tokens
class InputToken(BaseModel):
    # Token ID from the model tokenizer
    id: int
    # Token text
    text: str
    # Logprob
    # Optional since the logprob of the first token cannot be computed
    logprob: Optional[float] = None


# Generated tokens
class Token(BaseModel):
    # Token ID from the model tokenizer
    id: int
    # Token text
    text: str
    # Logprob
    logprob: Optional[float] = None
    # Is the token a special token
    # Can be used to ignore tokens when concatenating
    special: bool


# Generation finish reason
class FinishReason(str, Enum):
    # number of generated tokens == `max_new_tokens`
    Length = "length"
    # the model generated its end of sequence token
    EndOfSequenceToken = "eos_token"
    # the model generated a text included in `stop_sequences`
    StopSequence = "stop_sequence"


# Additional sequences when using the `best_of` parameter
class BestOfSequence(BaseModel):
    # Generated text
    generated_text: str
    # Generation finish reason
    finish_reason: FinishReason
    # Number of generated tokens
    generated_tokens: int
    # Sampling seed if sampling was activated
    seed: Optional[int] = None
    # Decoder input tokens, empty if decoder_input_details is False
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
    top_tokens: Optional[List[List[Token]]] = None


# `generate` details
class Details(BaseModel):
    # Generation finish reason
    finish_reason: FinishReason
    # Number of generated tokens
    generated_tokens: int
    # Sampling seed if sampling was activated
    seed: Optional[int] = None
    # Decoder input tokens, empty if decoder_input_details is False
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
    top_tokens: Optional[List[List[Token]]] = None
    # Additional sequences when using the `best_of` parameter
    best_of_sequences: Optional[List[BestOfSequence]] = None


# `generate` return value
class Response(BaseModel):
    # Generated text
    generated_text: str
    # Generation details
    details: Details


# `generate_stream` details
class StreamDetails(BaseModel):
    # Generation finish reason
    finish_reason: FinishReason
    # Number of generated tokens
    generated_tokens: int
    # Sampling seed if sampling was activated
    seed: Optional[int] = None


# `generate_stream` return value
class StreamResponse(BaseModel):
    # Generated token
    token: Token
    # Most likely tokens
    top_tokens: Optional[List[Token]] = None
    # Complete generated text
    # Only available when the generation is finished
    generated_text: Optional[str] = None
    # Generation details
    # Only available when the generation is finished
    details: Optional[StreamDetails] = None


# Inference API currently deployed model
class DeployedModel(BaseModel):
    # Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
    # with model_ prefixes, since this disables guardrails for colliding fields:
    # https://github.com/pydantic/pydantic/issues/9177
    model_config = ConfigDict(protected_namespaces=())
    model_id: str
    sha: str


================================================
FILE: crate-hashes.json
================================================
{
  "git+https://github.com/dottxt-ai/outlines-core.git?rev=ba10c619fc9bf3c487e43f49bdecb95a24bb465c#outlines-core@0.1.0": "1j9dcd831b0bmmjk2n4aag3x47qnqmkpg4gqpvwwyic7744llbfm"
}

================================================
FILE: docs/README.md
================================================
Documentation available at: https://huggingface.co/docs/text-generation-inference

## Release

When making a release, please update the latest version in the documentation with:
```
export OLD_VERSION="2\.0\.3"
export NEW_VERSION="2\.0\.4"
find . -name '*.md' -exec sed -i -e "s/$OLD_VERSION/$NEW_VERSION/g" {} \;
```


================================================
FILE: docs/index.html
================================================
<html>
    <head>
        <!-- Load the latest Swagger UI code and style from npm using unpkg.com -->
        <script src="https://unpkg.com/swagger-ui-dist@3/swagger-ui-bundle.js"></script>
        <link rel="stylesheet" type="text/css" href="https://unpkg.com/swagger-ui-dist@3/swagger-ui.css"/>
        <title>Text Generation Inference API</title>
    </head>
    <body>
        <div id="swagger-ui"></div> <!-- Div to hold the UI component -->
        <script>
            window.onload = function () {
                // Begin Swagger UI call region
                const ui = SwaggerUIBundle({
                    url: "openapi.json", //Location of Open API spec in the repo
                    dom_id: '#swagger-ui',
                    deepLinking: true,
                    supportedSubmitMethods: [],
                    presets: [
                        SwaggerUIBundle.presets.apis,
                        SwaggerUIBundle.SwaggerUIStandalonePreset
                    ],
                    plugins: [
                        SwaggerUIBundle.plugins.DownloadUrl
                    ],
                })
                window.ui = ui
            }
        </script>
    </body>
</html>


================================================
FILE: docs/openapi.json
================================================
{
  "openapi": "3.0.3",
  "info": {
    "title": "Text Generation Inference",
    "description": "Text Generation Webserver",
    "contact": {
      "name": "Olivier Dehaene"
    },
    "license": {
      "name": "Apache 2.0",
      "url": "https://www.apache.org/licenses/LICENSE-2.0"
    },
    "version": "3.3.6-dev0"
  },
  "paths": {
    "/": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
        "operationId": "compat_generate",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/CompatGenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "application/json": {
                "schema": {
                  "type": "array",
                  "items": {
                    "$ref": "#/components/schemas/GenerateResponse"
                  }
                }
              },
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/StreamResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error",
                  "error_type": "validation"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation",
                  "error_type": "generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded",
                  "error_type": "overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation",
                  "error_type": "incomplete_generation"
                }
              }
            }
          }
        }
      }
    },
    "/chat_tokenize": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Template and tokenize ChatRequest",
        "operationId": "get_chat_tokenize",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/ChatRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Templated and tokenized ChatRequest",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ChatTokenizeResponse"
                }
              }
            }
          },
          "404": {
            "description": "Failed to tokenize ChatRequest",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                }
              }
            }
          }
        }
      }
    },
    "/generate": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens",
        "operationId": "generate",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/GenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/GenerateResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error",
                  "error_type": "validation"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation",
                  "error_type": "generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded",
                  "error_type": "overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation",
                  "error_type": "incomplete_generation"
                }
              }
            }
          }
        }
      }
    },
    "/generate_stream": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate a stream of token using Server-Sent Events",
        "operationId": "generate_stream",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/GenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/StreamResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error",
                  "error_type": "validation"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation",
                  "error_type": "generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded",
                  "error_type": "overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation",
                  "error_type": "incomplete_generation"
                }
              }
            }
          }
        }
      }
    },
    "/health": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Health check method",
        "operationId": "health",
        "responses": {
          "200": {
            "description": "Everything is working fine"
          },
          "503": {
            "description": "Text generation inference is down",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "unhealthy",
                  "error_type": "healthcheck"
                }
              }
            }
          }
        }
      }
    },
    "/info": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Text Generation Inference endpoint info",
        "operationId": "get_model_info",
        "responses": {
          "200": {
            "description": "Served model info",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/Info"
                }
              }
            }
          }
        }
      }
    },
    "/invocations": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens from Sagemaker request",
        "operationId": "sagemaker_compatibility",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/SagemakerRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Chat Completion",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/SagemakerResponse"
                }
              },
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/SagemakerStreamResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error",
                  "error_type": "validation"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation",
                  "error_type": "generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded",
                  "error_type": "overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation",
                  "error_type": "incomplete_generation"
                }
              }
            }
          }
        }
      }
    },
    "/metrics": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Prometheus metrics scrape endpoint",
        "operationId": "metrics",
        "responses": {
          "200": {
            "description": "Prometheus Metrics",
            "content": {
              "text/plain": {
                "schema": {
                  "type": "string"
                }
              }
            }
          }
        }
      }
    },
    "/tokenize": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Tokenize inputs",
        "operationId": "tokenize",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/GenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Tokenized ids",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/TokenizeResponse"
                }
              }
            }
          },
          "404": {
            "description": "No tokenizer found",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "No fast tokenizer available"
                }
              }
            }
          }
        }
      }
    },
    "/v1/chat/completions": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens",
        "operationId": "chat_completions",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/ChatRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Chat Completion",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ChatCompletion"
                }
              },
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ChatCompletionChunk"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error",
                  "error_type": "validation"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation",
                  "error_type": "generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded",
                  "error_type": "overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation",
                  "error_type": "incomplete_generation"
                }
              }
            }
          }
        }
      }
    },
    "/v1/completions": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens",
        "operationId": "completions",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/CompletionRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Chat Completion",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/CompletionFinal"
                }
              },
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/Chunk"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error",
                  "error_type": "validation"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation",
                  "error_type": "generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded",
                  "error_type": "overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation",
                  "error_type": "incomplete_generation"
                }
              }
            }
          }
        }
      }
    },
    "/v1/models": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Get model info",
        "operationId": "openai_get_model_info",
        "responses": {
          "200": {
            "description": "Served model info",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ModelInfo"
                }
              }
            }
          },
          "404": {
            "description": "Model not found",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                }
              }
            }
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "BestOfSequence": {
        "type": "object",
        "required": [
          "generated_text",
          "finish_reason",
          "generated_tokens",
          "prefill",
          "tokens"
        ],
        "properties": {
          "finish_reason": {
            "$ref": "#/components/schemas/FinishReason"
          },
          "generated_text": {
            "type": "string",
            "example": "test"
          },
          "generated_tokens": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "prefill": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/PrefillToken"
            }
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          },
          "tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "type": "array",
              "items": {
                "$ref": "#/components/schemas/Token"
              }
            }
          }
        }
      },
      "ChatCompletion": {
        "type": "object",
        "required": [
          "id",
          "created",
          "model",
          "system_fingerprint",
          "choices",
          "usage"
        ],
        "properties": {
          "choices": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ChatCompletionComplete"
            }
          },
          "created": {
            "type": "integer",
            "format": "int64",
            "example": "1706270835",
            "minimum": 0
          },
          "id": {
            "type": "string"
          },
          "model": {
            "type": "string",
            "example": "mistralai/Mistral-7B-Instruct-v0.2"
          },
          "system_fingerprint": {
            "type": "string"
          },
          "usage": {
            "$ref": "#/components/schemas/Usage"
          }
        }
      },
      "ChatCompletionChoice": {
        "type": "object",
        "required": [
          "index",
          "delta"
        ],
        "properties": {
          "delta": {
            "$ref": "#/components/schemas/ChatCompletionDelta"
          },
          "finish_reason": {
            "type": "string",
            "nullable": true
          },
          "index": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "logprobs": {
            "allOf": [
              {
                "$ref": "#/components/schemas/ChatCompletionLogprobs"
              }
            ],
            "nullable": true
          }
        }
      },
      "ChatCompletionChunk": {
        "type": "object",
        "required": [
          "id",
          "created",
          "model",
          "system_fingerprint",
          "choices"
        ],
        "properties": {
          "choices": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ChatCompletionChoice"
            }
          },
          "created": {
            "type": "integer",
            "format": "int64",
            "example": "1706270978",
            "minimum": 0
          },
          "id": {
            "type": "string"
          },
          "model": {
            "type": "string",
            "example": "mistralai/Mistral-7B-Instruct-v0.2"
          },
          "system_fingerprint": {
            "type": "string"
          },
          "usage": {
            "allOf": [
              {
                "$ref": "#/components/schemas/Usage"
              }
            ],
            "nullable": true
          }
        }
      },
      "ChatCompletionComplete": {
        "type": "object",
        "required": [
          "index",
          "message",
          "finish_reason"
        ],
        "properties": {
          "finish_reason": {
            "type": "string"
          },
          "index": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "logprobs": {
            "allOf": [
              {
                "$ref": "#/components/schemas/ChatCompletionLogprobs"
              }
            ],
            "nullable": true
          },
          "message": {
            "$ref": "#/components/schemas/OutputMessage"
          }
        }
      },
      "ChatCompletionDelta": {
        "oneOf": [
          {
            "$ref": "#/components/schemas/TextMessage"
          },
          {
            "$ref": "#/components/schemas/ToolCallDelta"
          }
        ]
      },
      "ChatCompletionLogprob": {
        "type": "object",
        "required": [
          "token",
          "logprob",
          "top_logprobs"
        ],
        "properties": {
          "logprob": {
            "type": "number",
            "format": "float"
          },
          "token": {
            "type": "string"
          },
          "top_logprobs": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ChatCompletionTopLogprob"
            }
          }
        }
      },
      "ChatCompletionLogprobs": {
        "type": "object",
        "required": [
          "content"
        ],
        "properties": {
          "content": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ChatCompletionLogprob"
            }
          }
        }
      },
      "ChatCompletionTopLogprob": {
        "type": "object",
        "required": [
          "token",
          "logprob"
        ],
        "properties": {
          "logprob": {
            "type": "number",
            "format": "float"
          },
          "token": {
            "type": "string"
          }
        }
      },
      "ChatRequest": {
        "type": "object",
        "required": [
          "messages"
        ],
        "properties": {
          "frequency_penalty": {
            "type": "number",
            "format": "float",
            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
            "example": "1.0",
            "nullable": true
          },
          "logit_bias": {
            "type": "array",
            "items": {
              "type": "number",
              "format": "float"
            },
            "description": "UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.",
            "nullable": true
          },
          "logprobs": {
            "type": "boolean",
            "description": "Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each\noutput token returned in the content of message.",
            "example": "false",
            "nullable": true
          },
          "max_tokens": {
            "type": "integer",
            "format": "int32",
            "description": "The maximum number of tokens that can be generated in the chat completion.",
            "default": "1024",
            "example": "32",
            "nullable": true,
            "minimum": 0
          },
          "messages": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Message"
            },
            "description": "A list of messages comprising the conversation so far.",
            "example": "[{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}]"
          },
          "model": {
            "type": "string",
            "description": "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
            "example": "mistralai/Mistral-7B-Instruct-v0.2",
            "nullable": true
          },
          "n": {
            "type": "integer",
            "format": "int32",
            "description": "UNUSED\nHow many chat completion choices to generate for each input message. Note that you will be charged based on the\nnumber of generated tokens across all of the choices. Keep n as 1 to minimize costs.",
            "example": "2",
            "nullable": true,
            "minimum": 0
          },
          "presence_penalty": {
            "type": "number",
            "format": "float",
            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics",
            "example": 0.1,
            "nullable": true
          },
          "response_format": {
            "allOf": [
              {
                "$ref": "#/components/schemas/GrammarType"
              }
            ],
            "default": "null",
            "nullable": true
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          },
          "stop": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "Up to 4 sequences where the API will stop generating further tokens.",
            "example": "null",
            "nullable": true
          },
          "stream": {
            "type": "boolean"
          },
          "stream_options": {
            "allOf": [
              {
                "$ref": "#/components/schemas/StreamOptions"
              }
            ],
            "nullable": true
          },
          "temperature": {
            "type": "number",
            "format": "float",
            "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.",
            "example": 1.0,
            "nullable": true
          },
          "tool_choice": {
            "allOf": [
              {
                "$ref": "#/components/schemas/ToolChoice"
              }
            ],
            "default": "auto",
            "nullable": true
          },
          "tool_prompt": {
            "type": "string",
            "description": "A prompt to be appended before the tools",
            "example": "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables.",
            "nullable": true
          },
          "tools": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Tool"
            },
            "description": "A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of\nfunctions the model may generate JSON inputs for.",
            "example": "null",
            "nullable": true
          },
          "top_logprobs": {
            "type": "integer",
            "format": "int32",
            "description": "An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.",
            "example": "5",
            "nullable": true,
            "minimum": 0
          },
          "top_p": {
            "type": "number",
            "format": "float",
            "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
            "example": 0.95,
            "nullable": true
          }
        }
      },
      "ChatTokenizeResponse": {
        "type": "object",
        "required": [
          "tokenize_response",
          "templated_text"
        ],
        "properties": {
          "templated_text": {
            "type": "string"
          },
          "tokenize_response": {
            "$ref": "#/components/schemas/TokenizeResponse"
          }
        }
      },
      "Chunk": {
        "type": "object",
        "required": [
          "id",
          "created",
          "choices",
          "model",
          "system_fingerprint"
        ],
        "properties": {
          "choices": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/CompletionComplete"
            }
          },
          "created": {
            "type": "integer",
            "format": "int64",
            "minimum": 0
          },
          "id": {
            "type": "string"
          },
          "model": {
            "type": "string"
          },
          "system_fingerprint": {
            "type": "string"
          }
        }
      },
      "CompatGenerateRequest": {
        "type": "object",
        "required": [
          "inputs"
        ],
        "properties": {
          "inputs": {
            "type": "string",
            "example": "My name is Olivier and I"
          },
          "parameters": {
            "$ref": "#/components/schemas/GenerateParameters"
          },
          "stream": {
            "type": "boolean",
            "default": "false"
          }
        }
      },
      "Completion": {
        "oneOf": [
          {
            "allOf": [
              {
                "$ref": "#/components/schemas/Chunk"
              },
              {
                "type": "object",
                "required": [
                  "object"
                ],
                "properties": {
                  "object": {
                    "type": "string",
                    "enum": [
                      "text_completion"
                    ]
                  }
                }
              }
            ]
          },
          {
            "allOf": [
              {
                "$ref": "#/components/schemas/CompletionFinal"
              },
              {
                "type": "object",
                "required": [
                  "object"
                ],
                "properties": {
                  "object": {
                    "type": "string",
                    "enum": [
                      "text_completion"
                    ]
                  }
                }
              }
            ]
          }
        ],
        "discriminator": {
          "propertyName": "object"
        }
      },
      "CompletionComplete": {
        "type": "object",
        "required": [
          "index",
          "text",
          "finish_reason"
        ],
        "properties": {
          "finish_reason": {
            "type": "string"
          },
          "index": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "logprobs": {
            "type": "array",
            "items": {
              "type": "number",
              "format": "float"
            },
            "nullable": true
          },
          "text": {
            "type": "string"
          }
        }
      },
      "CompletionFinal": {
        "type": "object",
        "required": [
          "id",
          "created",
          "model",
          "system_fingerprint",
          "choices",
          "usage"
        ],
        "properties": {
          "choices": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/CompletionComplete"
            }
          },
          "created": {
            "type": "integer",
            "format": "int64",
            "example": "1706270835",
            "minimum": 0
          },
          "id": {
            "type": "string"
          },
          "model": {
            "type": "string",
            "example": "mistralai/Mistral-7B-Instruct-v0.2"
          },
          "system_fingerprint": {
            "type": "string"
          },
          "usage": {
            "$ref": "#/components/schemas/Usage"
          }
        }
      },
      "CompletionRequest": {
        "type": "object",
        "required": [
          "prompt"
        ],
        "properties": {
          "frequency_penalty": {
            "type": "number",
            "format": "float",
            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
            "example": "1.0",
            "nullable": true
          },
          "max_tokens": {
            "type": "integer",
            "format": "int32",
            "description": "The maximum number of tokens that can be generated in the chat completion.",
            "default": "1024",
            "example": "32",
            "nullable": true,
            "minimum": 0
          },
          "model": {
            "type": "string",
            "description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
            "example": "mistralai/Mistral-7B-Instruct-v0.2",
            "nullable": true
          },
          "prompt": {
            "$ref": "#/components/schemas/Prompt"
          },
          "repetition_penalty": {
            "type": "number",
            "format": "float",
            "nullable": true
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          },
          "stop": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "Up to 4 sequences where the API will stop generating further tokens.",
            "example": "null",
            "nullable": true
          },
          "stream": {
            "type": "boolean"
          },
          "suffix": {
            "type": "string",
            "description": "The text to append to the prompt. This is useful for completing sentences or generating a paragraph of text.\nplease see the completion_template field in the model's tokenizer_config.json file for completion template.",
            "nullable": true
          },
          "temperature": {
            "type": "number",
            "format": "float",
            "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.",
            "example": 1.0,
            "nullable": true
          },
          "top_p": {
            "type": "number",
            "format": "float",
            "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
            "example": 0.95,
            "nullable": true
          }
        }
      },
      "DeltaToolCall": {
        "type": "object",
        "required": [
          "index",
          "id",
          "type",
          "function"
        ],
        "properties": {
          "function": {
            "$ref": "#/components/schemas/Function"
          },
          "id": {
            "type": "string"
          },
          "index": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "type": {
            "type": "string"
          }
        }
      },
      "Details": {
        "type": "object",
        "required": [
          "finish_reason",
          "generated_tokens",
          "prefill",
          "tokens"
        ],
        "properties": {
          "best_of_sequences": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/BestOfSequence"
            },
            "nullable": true
          },
          "finish_reason": {
            "$ref": "#/components/schemas/FinishReason"
          },
          "generated_tokens": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "prefill": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/PrefillToken"
            }
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          },
          "tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "type": "array",
              "items": {
                "$ref": "#/components/schemas/Token"
              }
            }
          }
        }
      },
      "ErrorResponse": {
        "type": "object",
        "required": [
          "error",
          "error_type"
        ],
        "properties": {
          "error": {
            "type": "string"
          },
          "error_type": {
            "type": "string"
          }
        }
      },
      "FinishReason": {
        "type": "string",
        "enum": [
          "length",
          "eos_token",
          "stop_sequence"
        ],
        "example": "Length"
      },
      "Function": {
        "type": "object",
        "required": [
          "arguments"
        ],
        "properties": {
          "arguments": {
            "type": "string"
          },
          "name": {
            "type": "string",
            "nullable": true
          }
        }
      },
      "FunctionDefinition": {
        "type": "object",
        "required": [
          "name",
          "arguments"
        ],
        "properties": {
          "arguments": {},
          "description": {
            "type": "string",
            "nullable": true
          },
          "name": {
            "type": "string"
          }
        }
      },
      "FunctionName": {
        "type": "object",
        "required": [
          "name"
        ],
        "properties": {
          "name": {
            "type": "string"
          }
        }
      },
      "GenerateParameters": {
        "type": "object",
        "properties": {
          "adapter_id": {
            "type": "string",
            "description": "Lora adapter id",
            "default": "null",
            "example": "null",
            "nullable": true
          },
          "best_of": {
            "type": "integer",
            "description": "Generate best_of sequences and return the one if the highest token logprobs.",
            "default": "null",
            "example": 1,
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "decoder_input_details": {
            "type": "boolean",
            "description": "Whether to return decoder input token logprobs and ids.",
            "default": "false"
          },
          "details": {
            "type": "boolean",
            "description": "Whether to return generation details.",
            "default": "true"
          },
          "do_sample": {
            "type": "boolean",
            "description": "Activate logits sampling.",
            "default": "false",
            "example": true
          },
          "frequency_penalty": {
            "type": "number",
            "format": "float",
            "description": "The parameter for frequency penalty. 1.0 means no penalty\nPenalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
            "default": "null",
            "example": 0.1,
            "nullable": true,
            "exclusiveMinimum": -2
          },
          "grammar": {
            "allOf": [
              {
                "$ref": "#/components/schemas/GrammarType"
              }
            ],
            "default": "null",
            "nullable": true
          },
          "max_new_tokens": {
            "type": "integer",
            "format": "int32",
            "description": "Maximum number of tokens to generate.",
            "default": "1024",
            "example": "20",
            "nullable": true,
            "minimum": 0
          },
          "repetition_penalty": {
            "type": "number",
            "format": "float",
            "description": "The parameter for repetition penalty. 1.0 means no penalty.\nSee [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.",
            "default": "null",
            "example": 1.03,
            "nullable": true,
            "exclusiveMinimum": 0
          },
          "return_full_text": {
            "type": "boolean",
            "description": "Whether to prepend the prompt to the generated text",
            "default": "null",
            "example": false,
            "nullable": true
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "description": "Random sampling seed.",
            "default": "null",
            "example": "null",
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "stop": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "Stop generating tokens if a member of `stop` is generated.",
            "example": [
              "photographer"
            ],
            "maxItems": 4
          },
          "temperature": {
            "type": "number",
            "format": "float",
            "description": "The value used to module the logits distribution.",
            "default": "null",
            "example": 0.5,
            "nullable": true,
            "exclusiveMinimum": 0
          },
          "top_k": {
            "type": "integer",
            "format": "int32",
            "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering.",
            "default": "null",
            "example": 10,
            "nullable": true,
            "exclusiveMinimum": 0
          },
          "top_n_tokens": {
            "type": "integer",
            "format": "int32",
            "description": "The number of highest probability vocabulary tokens to keep for top-n-filtering.",
            "default": "null",
            "example": 5,
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "top_p": {
            "type": "number",
            "format": "float",
            "description": "Top-p value for nucleus sampling.",
            "default": "null",
            "example": 0.95,
            "nullable": true,
            "maximum": 1,
            "exclusiveMinimum": 0
          },
          "truncate": {
            "type": "integer",
            "description": "Truncate inputs tokens to the given size.",
            "default": "null",
            "example": "null",
            "nullable": true,
            "minimum": 0
          },
          "typical_p": {
            "type": "number",
            "format": "float",
            "description": "Typical Decoding mass\nSee [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.",
            "default": "null",
            "example": 0.95,
            "nullable": true,
            "maximum": 1,
            "exclusiveMinimum": 0
          },
          "watermark": {
            "type": "boolean",
            "description": "Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).",
            "default": "false",
            "example": true
          }
        }
      },
      "GenerateRequest": {
        "type": "object",
        "required": [
          "inputs"
        ],
        "properties": {
          "inputs": {
            "type": "string",
            "example": "My name is Olivier and I"
          },
          "parameters": {
            "$ref": "#/components/schemas/GenerateParameters"
          }
        }
      },
      "GenerateResponse": {
        "type": "object",
        "required": [
          "generated_text"
        ],
        "properties": {
          "details": {
            "allOf": [
              {
                "$ref": "#/components/schemas/Details"
              }
            ],
            "nullable": true
          },
          "generated_text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "GrammarType": {
        "oneOf": [
          {
            "type": "object",
            "required": [
              "type",
              "value"
            ],
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "json"
                ]
              },
              "value": {
                "description": "A string that represents a [JSON Schema](https://json-schema.org/).\n\nJSON Schema is a declarative language that allows to annotate JSON documents\nwith types and descriptions."
              }
            }
          },
          {
            "type": "object",
            "required": [
              "type",
              "value"
            ],
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "regex"
                ]
              },
              "value": {
                "type": "string"
              }
            }
          },
          {
            "type": "object",
            "required": [
              "type",
              "value"
            ],
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "json_schema"
                ]
              },
              "value": {
                "$ref": "#/components/schemas/JsonSchemaConfig"
              }
            }
          }
        ],
        "discriminator": {
          "propertyName": "type"
        }
      },
      "Info": {
        "type": "object",
        "required": [
          "model_id",
          "max_concurrent_requests",
          "max_best_of",
          "max_stop_sequences",
          "max_input_tokens",
          "max_total_tokens",
          "validation_workers",
          "max_client_batch_size",
          "router",
          "version"
        ],
        "properties": {
          "docker_label": {
            "type": "string",
            "example": "null",
            "nullable": true
          },
          "max_best_of": {
            "type": "integer",
            "example": "2",
            "minimum": 0
          },
          "max_client_batch_size": {
            "type": "integer",
            "example": "32",
            "minimum": 0
          },
          "max_concurrent_requests": {
            "type": "integer",
            "description": "Router Parameters",
            "example": "128",
            "minimum": 0
          },
          "max_input_tokens": {
            "type": "integer",
            "example": "1024",
            "minimum": 0
          },
          "max_stop_sequences": {
            "type": "integer",
            "example": "4",
            "minimum": 0
          },
          "max_total_tokens": {
            "type": "integer",
            "example": "2048",
            "minimum": 0
          },
          "model_id": {
            "type": "string",
            "description": "Model info",
            "example": "bigscience/blomm-560m"
          },
          "model_pipeline_tag": {
            "type": "string",
            "example": "text-generation",
            "nullable": true
          },
          "model_sha": {
            "type": "string",
            "example": "e985a63cdc139290c5f700ff1929f0b5942cced2",
            "nullable": true
          },
          "router": {
            "type": "string",
            "description": "Router Info",
            "example": "text-generation-router"
          },
          "sha": {
            "type": "string",
            "example": "null",
            "nullable": true
          },
          "validation_workers": {
            "type": "integer",
            "example": "2",
            "minimum": 0
          },
          "version": {
            "type": "string",
            "example": "0.5.0"
          }
        }
      },
      "JsonSchemaConfig": {
        "type": "object",
        "required": [
          "schema"
        ],
        "properties": {
          "name": {
            "type": "string",
            "description": "Optional name identifier for the schema",
            "nullable": true
          },
          "schema": {
            "description": "The actual JSON schema definition"
          }
        }
      },
      "Message": {
        "allOf": [
          {
            "$ref": "#/components/schemas/MessageBody"
          },
          {
            "type": "object",
            "required": [
              "role"
            ],
            "properties": {
              "name": {
                "type": "string",
                "example": "\"David\"",
                "nullable": true
              },
              "role": {
                "type": "string",
                "example": "user"
              }
            }
          }
        ]
      },
      "MessageBody": {
        "oneOf": [
          {
            "type": "object",
            "required": [
              "content"
            ],
            "properties": {
              "content": {
                "$ref": "#/components/schemas/MessageContent"
              }
            }
          },
          {
            "type": "object",
            "required": [
              "tool_calls"
            ],
            "properties": {
              "tool_calls": {
                "type": "array",
                "items": {
                  "$ref": "#/components/schemas/ToolCall"
                }
              }
            }
          }
        ]
      },
      "MessageChunk": {
        "oneOf": [
          {
            "type": "object",
            "required": [
              "text",
              "type"
            ],
            "properties": {
              "text": {
                "type": "string"
              },
              "type": {
                "type": "string",
                "enum": [
                  "text"
                ]
              }
            }
          },
          {
            "type": "object",
            "required": [
              "image_url",
              "type"
            ],
            "properties": {
              "image_url": {
                "$ref": "#/components/schemas/Url"
              },
              "type": {
                "type": "string",
                "enum": [
                  "image_url"
                ]
              }
            }
          }
        ],
        "discriminator": {
          "propertyName": "type"
        }
      },
      "MessageContent": {
        "oneOf": [
          {
            "type": "string"
          },
          {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/MessageChunk"
            }
          }
        ]
      },
      "ModelInfo": {
        "type": "object",
        "required": [
          "id",
          "object",
          "created",
          "owned_by"
        ],
        "properties": {
          "created": {
            "type": "integer",
            "format": "int64",
            "example": 1686935002,
            "minimum": 0
          },
          "id": {
            "type": "string",
            "example": "gpt2"
          },
          "object": {
            "type": "string",
            "example": "model"
          },
          "owned_by": {
            "type": "string",
            "example": "openai"
          }
        }
      },
      "OutputMessage": {
        "oneOf": [
          {
            "$ref": "#/components/schemas/TextMessage"
          },
          {
            "$ref": "#/components/schemas/ToolCallMessage"
          }
        ]
      },
      "PrefillToken": {
        "type": "object",
        "required": [
          "id",
          "text",
          "logprob"
        ],
        "properties": {
          "id": {
            "type": "integer",
            "format": "int32",
            "example": 0,
            "minimum": 0
          },
          "logprob": {
            "type": "number",
            "format": "float",
            "example": -0.34,
            "nullable": true
          },
          "text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "Prompt": {
        "type": "array",
        "items": {
          "type": "string"
        }
      },
      "SagemakerRequest": {
        "oneOf": [
          {
            "$ref": "#/components/schemas/CompatGenerateRequest"
          },
          {
            "$ref": "#/components/schemas/ChatRequest"
          },
          {
            "$ref": "#/components/schemas/CompletionRequest"
          }
        ]
      },
      "SagemakerResponse": {
        "oneOf": [
          {
            "$ref": "#/components/schemas/GenerateResponse"
          },
          {
            "$ref": "#/components/schemas/ChatCompletion"
          },
          {
            "$ref": "#/components/schemas/CompletionFinal"
          }
        ]
      },
      "SagemakerStreamResponse": {
        "oneOf": [
          {
            "$ref": "#/components/schemas/StreamResponse"
          },
          {
            "$ref": "#/components/schemas/ChatCompletionChunk"
          },
          {
            "$ref": "#/components/schemas/Chunk"
          }
        ]
      },
      "SimpleToken": {
        "type": "object",
        "required": [
          "id",
          "text",
          "start",
          "stop"
        ],
        "properties": {
          "id": {
            "type": "integer",
            "format": "int32",
            "example": 0,
            "minimum": 0
          },
          "start": {
            "type": "integer",
            "example": 0,
            "minimum": 0
          },
          "stop": {
            "type": "integer",
            "example": 2,
            "minimum": 0
          },
          "text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "StreamDetails": {
        "type": "object",
        "required": [
          "finish_reason",
          "generated_tokens",
          "input_length"
        ],
        "properties": {
          "finish_reason": {
            "$ref": "#/components/schemas/FinishReason"
          },
          "generated_tokens": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "input_length": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          }
        }
      },
      "StreamOptions": {
        "type": "object",
        "properties": {
          "include_usage": {
            "type": "boolean",
            "description": "If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.",
            "example": "true"
          }
        }
      },
      "StreamResponse": {
        "type": "object",
        "required": [
          "index",
          "token"
        ],
        "properties": {
          "details": {
            "allOf": [
              {
                "$ref": "#/components/schemas/StreamDetails"
              }
            ],
            "default": "null",
            "nullable": true
          },
          "generated_text": {
            "type": "string",
            "default": "null",
            "example": "test",
            "nullable": true
          },
          "index": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "token": {
            "$ref": "#/components/schemas/Token"
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          }
        }
      },
      "TextMessage": {
        "type": "object",
        "required": [
          "role",
          "content"
        ],
        "properties": {
          "content": {
            "type": "string",
            "example": "My name is David and I"
          },
          "role": {
            "type": "string",
            "example": "user"
          },
          "tool_call_id": {
            "type": "string",
            "nullable": true
          }
        }
      },
      "Token": {
        "type": "object",
        "required": [
          "id",
          "text",
          "logprob",
          "special"
        ],
        "properties": {
          "id": {
            "type": "integer",
            "format": "int32",
            "example": 0,
            "minimum": 0
          },
          "logprob": {
            "type": "number",
            "format": "float",
            "example": -0.34,
            "nullable": true
          },
          "special": {
            "type": "boolean",
            "example": "false"
          },
          "text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "TokenizeResponse": {
        "type": "array",
        "items": {
          "$ref": "#/components/schemas/SimpleToken"
        }
      },
      "Tool": {
        "type": "object",
        "required": [
          "type",
          "function"
        ],
        "properties": {
          "function": {
            "$ref": "#/components/schemas/FunctionDefinition"
          },
          "type": {
            "type": "string",
            "example": "function"
          }
        }
      },
      "ToolCall": {
        "type": "object",
        "required": [
          "id",
          "type",
          "function"
        ],
        "properties": {
          "function": {
            "$ref": "#/components/schemas/FunctionDefinition"
          },
          "id": {
            "type": "string"
          },
          "type": {
            "type": "string"
          }
        }
      },
      "ToolCallDelta": {
        "type": "object",
        "required": [
          "role",
          "tool_calls"
        ],
        "properties": {
          "role": {
            "type": "string",
            "example": "assistant"
          },
          "tool_calls": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/DeltaToolCall"
            }
          }
        }
      },
      "ToolCallMessage": {
        "type": "object",
        "required": [
          "role",
          "tool_calls"
        ],
        "properties": {
          "role": {
            "type": "string",
            "example": "assistant"
          },
          "tool_calls": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ToolCall"
            }
          }
        }
      },
      "ToolChoice": {
        "oneOf": [
          {
            "type": "string",
            "description": "Means the model can pick between generating a message or calling one or more tools.",
            "enum": [
              "auto"
            ]
          },
          {
            "type": "string",
            "description": "Means the model will not call any tool and instead generates a message.",
            "enum": [
              "none"
            ]
          },
          {
            "type": "string",
            "description": "Means the model must call one or more tools.",
            "enum": [
              "required"
            ]
          },
          {
            "type": "object",
            "required": [
              "function"
            ],
            "properties": {
              "function": {
                "$ref": "#/components/schemas/FunctionName"
              }
            }
          }
        ],
        "description": "<https://platform.openai.com/docs/guides/function-calling/configuring-function-calling-behavior-using-the-tool_choice-parameter>"
      },
      "Url": {
        "type": "object",
        "required": [
          "url"
        ],
        "properties": {
          "url": {
            "type": "string"
          }
        }
      },
      "Usage": {
        "type": "object",
        "required": [
          "prompt_tokens",
          "completion_tokens",
          "total_tokens"
        ],
        "properties": {
          "completion_tokens": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "prompt_tokens": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "total_tokens": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          }
        }
      }
    }
  },
  "tags": [
    {
      "name": "Text Generation Inference",
      "description": "Hugging Face Text Generation Inference API"
    }
  ]
}


================================================
FILE: docs/source/_toctree.yml
================================================
- sections:
  - local: index
    title: Text Generation Inference
  - local: quicktour
    title: Quick Tour
  - local: supported_models
    title: Supported Models
  - local: installation_nvidia
    title: Using TGI with Nvidia GPUs
  - local: installation_amd
    title: Using TGI with AMD GPUs
  - local: installation_gaudi
    title: Using TGI with Intel Gaudi
  - local: installation_inferentia
    title: Using TGI with AWS Trainium and Inferentia
  - local: installation_tpu
    title: Using TGI with Google TPUs
  - local: installation_intel
    title: Using TGI with Intel GPUs
  - local: installation
    title: Installation from source
  - local: multi_backend_support
    title: Multi-backend support

  - local: architecture
    title: Internal Architecture
  - local: usage_statistics
    title: Usage Statistics
  title: Getting started
- sections:
  - local: basic_tutorials/consuming_tgi
    title: Consuming TGI
  - local: basic_tutorials/preparing_model
    title: Preparing Model for Serving
  - local: basic_tutorials/gated_model_access
    title: Serving Private & Gated Models
  - local: basic_tutorials/using_cli
    title: Using TGI CLI
  - local: basic_tutorials/non_core_models
    title: Non-core Model Serving
  - local: basic_tutorials/safety
    title: Safety
  - local: basic_tutorials/using_guidance
    title: Using Guidance, JSON, tools
  - local: basic_tutorials/visual_language_models
    title: Visual Language Models
  - local: basic_tutorials/monitoring
    title: Monitoring TGI with Prometheus and Grafana
  - local: basic_tutorials/train_medusa
    title: Train Medusa
  title: Tutorials
- sections:
  - local: backends/neuron
    title: Neuron
  - local: backends/gaudi
    title: Gaudi
  - local: backends/trtllm
    title: TensorRT-LLM
  - local: backends/llamacpp
    title: Llamacpp
  title: Backends
- sections:
  - local: reference/launcher
    title: All TGI CLI options
  - local: reference/metrics
    title: Exported Metrics
  - local: reference/api_reference
    title: API Reference
  title: Reference
- sections:
  - local: conceptual/chunking
    title: V3 update, caching and chunking
  - local: conceptual/streaming
    title: Streaming
  - local: conceptual/quantization
    title: Quantization
  - local: conceptual/tensor_parallelism
    title: Tensor Parallelism
  - local: conceptual/paged_attention
    title: PagedAttention
  - local: conceptual/safetensors
    title: Safetensors
  - local: conceptual/flash_attention
    title: Flash Attention
  - local: conceptual/speculation
    title: Speculation (Medusa, ngram)
  - local: conceptual/guidance
    title: How Guidance Works (via outlines)
  - local: conceptual/lora
    title: LoRA (Low-Rank Adaptation)
  - local: conceptual/external
    title: External Resources


  title: Conceptual Guides


================================================
FILE: docs/source/architecture.md
================================================
# Text Generation Inference Architecture

This document aims at describing the architecture of Text Generation Inference (TGI), by describing the call flow between the separate components.

A high-level architecture diagram can be seen here:

![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)

This diagram shows well there are these separate components:

- **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server.
- **The launcher** is a helper that will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments.
- **The model server**, responsible for receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.

Note that for other backends (eg. TRTLLM) the model server and launcher are specific to the backend.

The router and the model server can be two different machines, they do not need to be deployed together.

## The Router

This component is a rust web server binary that accepts HTTP requests using the custom [HTTP API](https://huggingface.github.io/text-generation-inference/), as well as OpenAI's [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api).
The router receives the API calls and handles the "baches" logic (and introduction to batching can be found [here](https://github.com/huggingface/text-generation-inference/blob/main/router/README.md)).
It uses different strategies to reduce latency between requests and responses, especially oriented to decoding latency. It will use queues, schedulers, and block allocators to achieve that and produce batched requests that it will then be sent to the model server.

### Router's command line

The router command line will be the way to pass parameters to it (it does not rely on configuration file):

```
Text Generation Webserver

Usage: text-generation-router [OPTIONS]

Options:
      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
          [env: MAX_CONCURRENT_REQUESTS=] [default: 128]
      --max-best-of <MAX_BEST_OF>
          [env: MAX_BEST_OF=] [default: 2]
      --max-stop-sequences <MAX_STOP_SEQUENCES>
          [env: MAX_STOP_SEQUENCES=] [default: 4]
      --max-top-n-tokens <MAX_TOP_N_TOKENS>
          [env: MAX_TOP_N_TOKENS=] [default: 5]
      --max-input-tokens <MAX_INPUT_TOKENS>
          [env: MAX_INPUT_TOKENS=] [default: 1024]
      --max-total-tokens <MAX_TOTAL_TOKENS>
          [env: MAX_TOTAL_TOKENS=] [default: 2048]
      --waiting-served-ratio <WAITING_SERVED_RATIO>
          [env: WAITING_SERVED_RATIO=] [default: 1.2]
      --max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
          [env: MAX_BATCH_PREFILL_TOKENS=] [default: 4096]
      --max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
          [env: MAX_BATCH_TOTAL_TOKENS=]
      --max-waiting-tokens <MAX_WAITING_TOKENS>
          [env: MAX_WAITING_TOKENS=] [default: 20]
      --max-batch-size <MAX_BATCH_SIZE>
          [env: MAX_BATCH_SIZE=]
      --hostname <HOSTNAME>
          [env: HOSTNAME=] [default: 0.0.0.0]
  -p, --port <PORT>
          [env: PORT=] [default: 3000]
      --master-shard-uds-path <MASTER_SHARD_UDS_PATH>
          [env: MASTER_SHARD_UDS_PATH=] [default: /tmp/text-generation-server-0]
      --tokenizer-name <TOKENIZER_NAME>
          [env: TOKENIZER_NAME=] [default: bigscience/bloom]
      --tokenizer-config-path <TOKENIZER_CONFIG_PATH>
          [env: TOKENIZER_CONFIG_PATH=]
      --revision <REVISION>
          [env: REVISION=]
      --validation-workers <VALIDATION_WORKERS>
          [env: VALIDATION_WORKERS=] [default: 2]
      --json-output
          [env: JSON_OUTPUT=]
      --otlp-endpoint <OTLP_ENDPOINT>
          [env: OTLP_ENDPOINT=]
      --otlp-service-name <OTLP_SERVICE_NAME>
          [env: OTLP_SERVICE_NAME=]
      --cors-allow-origin <CORS_ALLOW_ORIGIN>
          [env: CORS_ALLOW_ORIGIN=]
      --ngrok
          [env: NGROK=]
      --ngrok-authtoken <NGROK_AUTHTOKEN>
          [env: NGROK_AUTHTOKEN=]
      --ngrok-edge <NGROK_EDGE>
          [env: NGROK_EDGE=]
      --messages-api-enabled
          [env: MESSAGES_API_ENABLED=]
      --disable-grammar-support
          [env: DISABLE_GRAMMAR_SUPPORT=]
      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
          [env: MAX_CLIENT_BATCH_SIZE=] [default: 4]
  -h, --help
          Print help
  -V, --version
          Print version
```

## The Model Server

The model server is a python server, capable of starting a server waiting for gRPC requests, loads a given model, perform sharding to provide [tensor parallelism](https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism), and stays alive while waiting for new requests.
The model server supports models instantiated using Pytorch and optimized for inference mainly on CUDA/ROCM.

### Model Server Variants

Several variants of the model server exist that are actively supported by Hugging Face:

- By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference).
- A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
- A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ.
- The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained in the main TGI repository. Some model features differ.
- A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).

Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.

### Command Line Interface

The official command line interface (CLI) for the server supports three subcommands, `download-weights`, `quantize` and `serve`:

- `download-weights` will download weights from the hub and, in some variants it will convert weights to a format that is adapted to the given implementation;
- `quantize` will allow to quantize a model using the `qptq` package. This feature is not available nor supported on all variants;
- `serve` will start the server that load a model (or a model shard), receives gRPC calls from the router, performs an inference and provides a formatted response to the given request.

Serve's command line parameters on the TGI repository are these:

```
 Usage: cli.py serve [OPTIONS] MODEL_ID

╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────╮
│ *    model_id      TEXT  [default: None] [required]                                                      │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --revision                                       TEXT                        [default: None]             │
│ --sharded              --no-sharded                                          [default: no-sharded]       │
│ --quantize                                       [bitsandbytes|bitsandbytes  [default: None]             │
│                                                  -nf4|bitsandbytes-fp4|gptq                              │
│                                                  |awq|eetq|exl2|fp8]                                     │
│ --speculate                                      INTEGER                     [default: None]             │
│ --dtype                                          [float16|bfloat16]          [default: None]             │
│ --trust-remote-code    --no-trust-remote-code                                [default:                   │
│                                                                              no-trust-remote-code]       │
│ --uds-path                                       PATH                        [default:                   │
│                                                                              /tmp/text-generation-serve… │
│ --logger-level                                   TEXT                        [default: INFO]             │
│ --json-output          --no-json-output                                      [default: no-json-output]   │
│ --otlp-endpoint                                  TEXT                        [default: None]             │
│ --otlp-service-name                              TEXT                        [default:                   │
│                                                                              text-generation-inference...│
│ --help                                                                       Show this message and exit. │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```

Note that some variants might support different parameters, and they could possibly accept more options that can be passed on using environment variables.

## Call Flow

Once both components are initialized, weights downloaded and model server is up and running, router and model server exchange data and info through the gRPC call. There are currently two supported schemas, [v2](https://github.com/huggingface/text-generation-inference/blob/main/proto/generate.proto) and [v3](https://github.com/huggingface/text-generation-inference/blob/main/proto/v3/generate.proto). These two versions are almost identical, except for:

- input chunks support, for text and image data,
- paged attention support

Here's a diagram that displays the exchanges that follow the router and model server startup.

```mermaid
sequenceDiagram

    Router->>Model Server: service discovery
    Model Server-->>Router: urls for other shards

    Router->>Model Server: get model info
    Model Server-->>Router: shard info

    Router->>Model Server: health check
    Model Server-->>Router: health OK

    Router->>Model Server: warmup(max_input_tokens, max_batch_prefill_tokens, max_total_tokens, max_batch_size)
    Model Server-->>Router: warmup result
```

After these are done, the router is ready to receive generate calls from multiple clients. Here's an example.

```mermaid
sequenceDiagram
    participant Client 1
    participant Client 2
    participant Client 3
    participant Router
    participant Model Server

    Client 1->>Router: generate_stream
    Router->>Model Server: prefill(batch1)
    Model Server-->>Router: generations, cached_batch1, timings
    Router-->>Client 1: token 1

    Router->>Model Server: decode(cached_batch1)
    Model Server-->>Router: generations, cached_batch1, timings
    Router-->>Client 1: token 2

    Router->>Model Server: decode(cached_batch1)
    Model Server-->>Router: generations, cached_batch1, timings
    Router-->>Client 1: token 3

    Client 2->>Router: generate_stream
    Router->>Model Server: prefill(batch2)
    Note right of Model Server: This stops previous batch, that is restarted
    Model Server-->>Router: generations, cached_batch2, timings
    Router-->>Client 2: token 1'

    Router->>Model Server: decode(cached_batch1, cached_batch2)
    Model Server-->>Router: generations, cached_batch1, timings
    Router-->>Client 1: token 4
    Router-->>Client 2: token 2'

    Note left of Client 1: Client 1 leaves
    Router->>Model Server: filter_batch(cached_batch1, request_ids_to_keep=batch2)
    Model Server-->>Router: filtered batch

    Router->>Model Server: decode(cached_batch2)
    Model Server-->>Router: generations, cached_batch2, timings
    Router-->>Client 2: token 3'

    Client 3->>Router: generate_stream
    Note right of Model Server: This stops previous batch, that is restarted
    Router->>Model Server: prefill(batch3)
    Note left of Client 1: Client 3 leaves without receiving any batch
    Router->>Model Server: clear_cache(batch3)
    Note right of Model Server: This stops previous batch, that is restarted

    Router->>Model Server: decode(cached_batch3)
    Note right of Model Server: Last token (stopping criteria)
    Model Server-->>Router: generations, cached_batch3, timings
    Router-->>Client 2: token 4'


```


================================================
FILE: docs/source/backends/gaudi.mdx
================================================
# Gaudi Backend for Text Generation Inference

## Overview
Text Generation Inference (TGI) has been optimized to run on Gaudi hardware via the Gaudi backend for TGI.

## Supported Hardware
- **Gaudi1**: Available on [AWS EC2 DL1 instances](https://aws.amazon.com/ec2/instance-types/dl1/)
- **Gaudi2**: Available on [Intel Cloud](https://console.cloud.intel.com/docs/reference/ai_instances.html)
- **Gaudi3**: Available on [Intel Cloud](https://console.cloud.intel.com/docs/reference/ai_instances.html)

## Tutorial: Getting Started with TGI on Gaudi

### Basic Usage
The easiest way to run TGI on Gaudi is to use the official Docker image:

```bash
model=meta-llama/Meta-Llama-3.1-8B-Instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
hf_token=YOUR_HF_ACCESS_TOKEN

docker run --runtime=habana --cap-add=sys_nice --ipc=host \
    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
    ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model
```

Once you see the `connected` log, the server is ready to accept requests:
> 2024-05-22T19:31:48.302239Z  INFO text_generation_router: router/src/main.rs:378: Connected

You can find your `YOUR_HF_ACCESS_TOKEN` at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). This is necessary to access gated models like llama3.1.

### Making Your First Request
You can send a request from a separate terminal:

```bash
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":32}}' \
    -H 'Content-Type: application/json'
```

## How-to Guides

You can view the full list of supported models in the [Supported Models](https://huggingface.co/docs/text-generation-inference/backends/gaudi#supported-models) section.

For example, to run Llama3.1-8B, you can use the following command:

```bash
model=meta-llama/Meta-Llama-3.1-8B-Instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
hf_token=YOUR_ACCESS_TOKEN

docker run --runtime=habana --cap-add=sys_nice --ipc=host \
    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
    ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
    --model-id $model
    <text-generation-inference-launcher-arguments>
```

For the full list of service parameters, refer to the [launcher-arguments page](https://huggingface.co/docs/text-generation-inference/reference/launcher).

The validated docker commands can be found in the [examples/docker_commands folder](https://github.com/huggingface/text-generation-inference/tree/main/backends/gaudi/examples/docker_commands).

> Note: `--runtime=habana --cap-add=sys_nice --ipc=host ` is required to enable docker to use the Gaudi hardware (more details [here](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html)).

### How to Enable Multi-Card Inference (Sharding)

TGI-Gaudi supports sharding for multi-card inference, allowing you to distribute the load across multiple Gaudi cards. This is recommended to run large models and to speed up inference.

For example, on a machine with 8 Gaudi cards, you can run:

```bash
docker run --runtime=habana --ipc=host --cap-add=sys_nice \
    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
    tgi-gaudi \
    --model-id $model --sharded true --num-shard 8
```

<Tip>
We recommend always using sharding when running on a multi-card machine.
</Tip>

### How to Use Different Precision Formats

#### BF16 Precision (Default)
By default, all models run with BF16 precision on Gaudi hardware.

#### FP8 Precision
TGI-Gaudi supports FP8 precision inference, which can significantly reduce memory usage and improve performance for large models. We support model like W8A8 FP compressed-tensors parameters such as [RedHatAI/Mixtral-8x7B-Instruct-v0.1-FP8](https://huggingface.co/RedHatAI/Mixtral-8x7B-Instruct-v0.1-FP8) and AutoFP8 generated model[RedHatAI/Meta-Llama-3-8B-Instruct-FP8](https://huggingface.co/RedHatAI/Meta-Llama-3-8B-Instruct-FP8) .
TGI-Gaudi supports FP8 precision inference with [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html).


### How to Run Vision-Language Models (VLMs)

Gaudi supports VLM inference.

Example for Llava-v1.6-Mistral-7B on 1 card:

Start the TGI server via the following command:
```bash
model=llava-hf/llava-v1.6-mistral-7b-hf
volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run

docker run -p 8080:80 \
   --runtime=habana \
   --cap-add=sys_nice \
   --ipc=host \
   -v $volume:/data \
   ghcr.io/huggingface/text-generation-inference:3.3.5-gaudi \
   --model-id $model \
   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
   --max-total-tokens 8192 --max-batch-size 4
```

You can then send a request to the server via the following command:
```bash
curl -N 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":32}}' \
    -H 'Content-Type: application/json'
```

> Note: In Llava-v1.6-Mistral-7B, an image usually accounts for 2000 input tokens. For example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image. Otherwise the image may be truncated. The value of `max-batch-prefill-tokens` is 16384, which is calculated as follows: `prefill_batch_size` = `max-batch-prefill-tokens` / `max-input-tokens`.

### How to Benchmark Performance

We recommend using the [inference-benchmarker tool](https://github.com/huggingface/inference-benchmarker) to benchmark performance on Gaudi hardware.

This benchmark tool simulates user requests and measures the performance of the model on realistic scenarios.

To run it on the same machine, you can do the following:
```bash
MODEL=meta-llama/Llama-3.1-8B-Instruct
HF_TOKEN=<your HF READ token>
# run a benchmark to evaluate the performance of the model for chat use case
# we mount results to the current directory
docker run \
    --rm \
    -it \
    --net host \
    -v $(pwd):/opt/inference-benchmarker/results \
    -e "HF_TOKEN=$HF_TOKEN" \
    ghcr.io/huggingface/inference-benchmarker:latest \
    inference-benchmarker \
    --tokenizer-name "$MODEL" \
    --url http://localhost:8080 \
    --profile chat
```

Please refer to the [inference-benchmarker README](https://github.com/huggingface/inference-benchmarker) for more details.

## Explanation: Understanding TGI on Gaudi

### The Warmup Process

Intel Gaudi accelerators perform best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime)
generates optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be highly dependent on input and output tensor shapes, requiring graph recompilation
when encountering tensors with different shapes within the same topology. While these binaries efficiently utilize Gaudi, the compilation process itself can introduce noticeable overhead in end-to-end execution.
In dynamic inference serving scenarios, minimizing the number of graph compilations and reducing the risk of graph compilation occurring during server runtime is important.

To ensure optimal performance, warmup is performed at the beginning of each server run. This process creates queries with various input shapes based on provided parameters and runs basic TGI operations (prefill, decode).

Note: Model warmup can take several minutes, especially for FP8 inference. For faster subsequent runs, refer to [Disk Caching Eviction Policy](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#disk-caching-eviction-policy).

### Understanding Parameter Tuning

#### Sequence Length Parameters
- `--max-input-tokens` is the maximum possible input prompt length. Default value is `4095`.
- `--max-total-tokens` is the maximum possible total length of the sequence (input and output). Default value is `4096`.

#### Batch Size Parameters
- For prefill operation, please set `--max-batch-prefill-tokens` as `bs * max-input-tokens`, where `bs` is your expected maximum prefill batch size.
- For decode operation, please set `--max-batch-size` as `bs`, where `bs` is your expected maximum decode batch size.
- Please note that batch size will be always padded to the nearest shapes that has been warmed up. This is done to avoid out of memory issues and to ensure that the graphs are reused efficiently.


## Reference

This section contains reference information about the Gaudi backend.

### Supported Models

Text Generation Inference enables serving optimized models on Gaudi hardware. The following sections list which models (VLMs & LLMs) are supported on Gaudi.

**Large Language Models (LLMs)**
- [deepseek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)
- [deepseek-v2](https://huggingface.co/deepseek-ai/DeepSeek-V2)
- [Llama2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
- [Llama3](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
- [CodeLlama](https://huggingface.co/codellama/CodeLlama-13b-hf)
- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
- [Qwen 2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f)
- [Qwen 3](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f)
- [Qwen 3 Moe](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f)
- [Phi-1.5](https://huggingface.co/microsoft/phi-1_5)
- [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
- [PhiMoe](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)
- [Gemma](https://huggingface.co/google/gemma-7b-it)
- [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
- [Gemma3 Text](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d)
- [Granite](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct)
- [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
- [dbrx](https://huggingface.co/databricks/dbrx-instruct)
- [Starcoder2](https://huggingface.co/bigcode/starcoder2-3b)
- [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct)
- [GPT-2](https://huggingface.co/openai-community/gpt2)
- [gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)
- [gpt-bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder)
- [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)


**Vision-Language Models (VLMs)**
- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf)
- [Mllama (Multimodal Llama from Meta)](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)
- [idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b)
- [idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)
- [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224)
- [Llama4](https://huggingface.co/collections/meta-llama/llama-4-67f0c30d9fe03840bc9d0164)
- [Gemma3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d)
- [Qwen 2.5 VL](https://huggingface.co/collections/Qwen/qwen25-vl-6795ffac22b334a837c0f9a5)
- [Qwen 2 VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)

If you have an issue with a model, please open an issue on the [Gaudi backend repository](https://github.com/huggingface/text-generation-inference/issues).

### Environment Variables

The following table contains the environment variables that can be used to configure the Gaudi backend:

| Name                        | Value(s)   | Default          | Description                                                                                                                      | Usage                        |
|-----------------------------| :--------- | :--------------- | :------------------------------------------------------------------------------------------------------------------------------- | :--------------------------- |
| LIMIT_HPU_GRAPH             | True/False | True             | Skip HPU graph usage for prefill to save memory, set to `True` for large sequence/decoding lengths(e.g. 300/212)                 | add -e in docker run command |
| SKIP_TOKENIZER_IN_TGI       | True/False | False            | Skip tokenizer for input/output processing                                                                                       | add -e in docker run command |
| VLLM_SKIP_WARMUP              | True/False | False             | Skip graph warmup during server initialization which is not recommended, but could be used for debug.                            | add -e in docker run command |


## Contributing

Contributions to the TGI-Gaudi project are welcome. Please refer to the [contributing guide](https://github.com/huggingface/text-generation-inference/blob/main/CONTRIBUTING.md).

**Guidelines for contributing to Gaudi on TGI:** All changes should be made within the `backends/gaudi` folder. In general, you should avoid modifying the router, launcher, or benchmark to accommodate Gaudi hardware, as all Gaudi-specific logic should be contained within the `backends/gaudi` folder.

### Building the Docker Image from Source

To build the Docker image from source:

```bash
make -C backends/gaudi image
```

This builds the image and saves it as `tgi-gaudi`. You can then run TGI-Gaudi with this image:

```bash
model=meta-llama/Meta-Llama-3.1-8B-Instruct
volume=$PWD/data
hf_token=YOUR_ACCESS_TOKEN

docker run --runtime=habana --ipc=host --cap-add=sys_nice \
    -p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
    tgi-gaudi \
    --model-id $model
```

For more details, see the [README of the Gaudi backend](https://github.com/huggingface/text-generation-inference/blob/main/backends/gaudi/README.md) and the [Makefile of the Gaudi backend](https://github.com/huggingface/text-generation-inference/blob/main/backends/gaudi/Makefile).


================================================
FILE: docs/source/backends/llamacpp.md
================================================
# Llamacpp Backend

The llamacpp backend facilitates the deployment of large language models
(LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine
optimized for both CPU and GPU computation. This backend is a component
of Hugging Face’s **Text Generation Inference (TGI)** suite,
specifically designed to streamline the deployment of LLMs in production
environments.

## Key Capabilities

- Full compatibility with GGUF format and all quantization formats
  (GGUF-related constraints may be mitigated dynamically by on-the-fly
  generation in future updates)
- Optimized inference on CPU and GPU architectures
- Containerized deployment, eliminating dependency complexity
- Seamless interoperability with the Hugging Face ecosystem

## Model Compatibility

This backend leverages models formatted in **GGUF**, providing an
optimized balance between computational efficiency and model accuracy.
You will find the best models on [Hugging Face][GGUF].

## Build Docker image

For optimal performance, the Docker image is compiled with native CPU
instructions by default. As a result, it is strongly recommended to run
the container on the same host architecture used during the build
process. Efforts are ongoing to improve portability across different
systems while preserving high computational efficiency.

To build the Docker image, use the following command:

```bash
docker build \
    -t tgi-llamacpp \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp
```

### Build parameters

| Parameter (with --build-arg)              | Description                      |
| ----------------------------------------- | -------------------------------- |
| `llamacpp_version=bXXXX`                  | Specific version of llama.cpp    |
| `llamacpp_cuda=ON`                        | Enables CUDA acceleration        |
| `llamacpp_native=OFF`                     | Disable automatic CPU detection  |
| `llamacpp_cpu_arm_arch=ARCH[+FEATURE]...` | Specific ARM CPU and features    |
| `cuda_arch=ARCH`                          | Defines target CUDA architecture |

For example, to target Graviton4 when building on another ARM
architecture:

```bash
docker build \
    -t tgi-llamacpp \
    --build-arg llamacpp_native=OFF \
    --build-arg llamacpp_cpu_arm_arch=armv9-a+i8mm \
    https://github.com/huggingface/text-generation-inference.git \
    -f Dockerfile_llamacpp
```

## Run Docker image

### CPU-based inference

```bash
docker run \
    -p 3000:3000 \
    -e "HF_TOKEN=$HF_TOKEN" \
    -v "$HOME/models:/app/models" \
    tgi-llamacpp \
    --model-id "Qwen/Qwen2.5-3B-Instruct"
```

### GPU-Accelerated inference

```bash
docker run \
    --gpus all \
    -p 3000:3000 \
    -e "HF_TOKEN=$HF_TOKEN" \
    -v "$HOME/models:/app/models" \
    tgi-llamacpp \
    --n-gpu-layers 99 \
    --model-id "Qwen/Qwen2.5-3B-Instruct"
```

## Using a custom GGUF

GGUF files are optional as they will be automatically generated at
startup if not already present in the `models` directory. However, if
the default GGUF generation is not suitable for your use case, you can
provide your own GGUF file with `--model-gguf`, for example:

```bash
docker run \
    -p 3000:3000 \
    -e "HF_TOKEN=$HF_TOKEN" \
    -v "$HOME/models:/app/models" \
    tgi-llamacpp \
    --model-id "Qwen/Qwen2.5-3B-Instruct" \
    --model-gguf "models/qwen2.5-3b-instruct-q4_0.gguf"
```

Note that `--model-id` is still required.

## Advanced parameters

A full listing of configurable parameters is available in the `--help`:

```bash
docker run tgi-llamacpp --help

```

The table below summarizes key options:

| Parameter                           | Description                                                            |
|-------------------------------------|------------------------------------------------------------------------|
| `--n-threads`                       | Number of threads to use for generation                                |
| `--n-threads-batch`                 | Number of threads to use for batch processing                          |
| `--n-gpu-layers`                    | Number of layers to store in VRAM                                      |
| `--split-mode`                      | Split the model across multiple GPUs                                   |
| `--defrag-threshold`                | Defragment the KV cache if holes/size > threshold                      |
| `--numa`                            | Enable NUMA optimizations                                              |
| `--disable-mmap`                    | Disable memory mapping for the model                                   |
| `--use-mlock`                       | Use memory locking to prevent swapping                                 |
| `--disable-offload-kqv`             | Disable offloading of KQV operations to the GPU                        |
| `--disable-flash-attention`         | Disable flash attention                                                |
| `--type-k`                          | Data type used for K cache                                             |
| `--type-v`                          | Data type used for V cache                                             |
| `--validation-workers`              | Number of tokenizer workers used for payload validation and truncation |
| `--max-concurrent-requests`         | Maximum number of concurrent requests                                  |
| `--max-input-tokens`                | Maximum number of input tokens per request                             |
| `--max-total-tokens`                | Maximum number of total tokens (input + output) per request            |
| `--max-batch-total-tokens`          | Maximum number of tokens in a batch                                    |
| `--max-physical-batch-total-tokens` | Maximum number of tokens in a physical batch                           |
| `--max-batch-size`                  | Maximum number of requests per batch                                   |

---
[llama.cpp]: https://github.com/ggerganov/llama.cpp
[GGUF]: https://huggingface.co/models?library=gguf&sort=trending


================================================
FILE: docs/source/backends/neuron.md
================================================
# Neuron backend for AWS Trainium and Inferentia

The Neuron backend allows the deployment of TGI on AWS Trainium and Inferentia family of chips.

The following hardware targets are supported:
- Trainium 1,
- Inferentia 2.

## Features

The basic TGI features are supported:

- continuous batching,
- token streaming,
- greedy search and multinomial sampling using [transformers](https://huggingface.co/docs/transformers/generation_strategies#customize-text-generation).


## Deploy the service from the Hugging Face hub

The simplest way to deploy the NeuronX TGI service for a specific model is to follow the
deployment instructions in the model card:

- click on the "Deploy" button on the right,
- select your deployment service ("Inference Endpoints" and "SageMaker" are supported),
- select "AWS Trainum & Inferentia",
- follow the instructions.


## Deploy the service on a dedicated host

The service is launched simply by running the text-generation-inference container with two sets of parameters:

```
docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.3.5-neuron <service_parameters>
```

- system parameters are used to map ports, volumes and devices between the host and the service,
- service parameters are forwarded to the `text-generation-launcher`.

When deploying a service, you will need a pre-compiled Neuron model. The Neuron TGI backend supports two main modes of operation:

- you can either deploy the service on a model that has already been exported to Neuron,
- or alternatively you can take advantage of the Neuron Model Cache to export your own model.

### Common system parameters

Whenever you launch a TGI service, we highly recommend you to mount a shared volume mounted as `/data` in the container: this is where
the models will be cached to speed up further instantiations of the service.

Note also that enough neuron devices should be made visible to the container, knowing that each neuron device has two cores (so when deploying on two cores you need to expose at least one device).
The recommended way to expose a device in a production environment is to use explicitly the `--device` option (e.g `--device /dev/neuron0`) repeated as many time as there are devices to be exposed.

Note: alternatively, for a quick local test it is also possible to launch the service in `privileged` mode to get access to all neuron devices.

Finally, you might want to export the `HF_TOKEN` if you want to access gated repositories.

Here is an example of a service instantiation exposing only the first device:

```
docker run -p 8080:80 \
       -v $(pwd)/data:/data \
       --device=/dev/neuron0 \
       -e HF_TOKEN=${HF_TOKEN} \
       ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron \
       <service_parameters>
```

### Using a standard model from the 🤗 [HuggingFace Hub](https://huggingface.co/aws-neuron) (recommended)

We maintain a Neuron Model Cache of the most popular architecture and deployment parameters under [aws-neuron/optimum-neuron-cache](https://huggingface.co/aws-neuron/optimum-neuron-cache).

If you just want to try the service quickly using a model without exporting it to Neuron first, it is thus still possible, pending some conditions:
- you must specify the export parameters when launching the service (or use default parameters),
- the model configuration must be cached.

The snippet below shows how you can deploy a service from a hub standard model:

```
export HF_TOKEN=<YOUR_TOKEN>
docker run -p 8080:80 \
       -v $(pwd)/data:/data \
       --device=/dev/neuron0 \
       --device=/dev/neuron1 \
       --device=/dev/neuron2 \
       --device=/dev/neuron3 \
       -e HF_TOKEN=${HF_TOKEN} \
       -e HF_AUTO_CAST_TYPE="fp16" \
       -e HF_NUM_CORES=8 \
       ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron \
       --model-id meta-llama/Meta-Llama-3-8B \
       --max-batch-size 1 \
       --max-input-length 3164 \
       --max-total-tokens 4096
```

### Using a model exported to a local path

Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-text-generation-inference) locally.

You can then deploy the service inside the shared volume:

```
docker run -p 8080:80 \
       -v $(pwd)/data:/data \
       --device=/dev/neuron0 \
       --device=/dev/neuron1 \
       ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron \
       --model-id /data/<neuron_model_path>
```

Note: You don't need to specify any service parameters, as they will all be deduced from the model export configuration. You must however expose enough devices to match the number of cores specified during the export phase.


### Using a neuron model from the 🤗 [HuggingFace Hub](https://huggingface.co/)

The easiest way to share a neuron model inside your organization is to push it on the Hugging Face hub, so that it can be deployed directly without requiring an export.

The snippet below shows how you can deploy a service from a hub neuron model:

```
docker run -p 8080:80 \
       -v $(pwd)/data:/data \
       --device=/dev/neuron0 \
       --device=/dev/neuron1 \
       -e HF_TOKEN=${HF_TOKEN} \
       ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron \
       --model-id <organization>/<neuron-model>
```

### Choosing service parameters

Use the following command to list the available service parameters:

```
docker run ghcr.io/huggingface/text-generation-inference:<VERSION>-neuron --help
```

The configuration of an inference endpoint is always a compromise between throughput and latency: serving more requests in parallel will allow a higher throughput, but it will increase the latency.

The neuron models have static input dimensions `[batch_size, max_length]`.

This adds several restrictions to the following parameters:

- `--max-batch-size` must be set to `batch size`,
- `--max-input-length` must be lower than `max_length`,
- `--max-total-tokens` must be set to `max_length` (it is per-request).

Although not strictly necessary, but important for efficient prefilling:

- `--max-batch-prefill-tokens` should be set to `batch_size` * `max-input-length`.

### Choosing the correct batch size

As seen in the previous paragraph, neuron model static batch size has a direct influence on the endpoint latency and throughput.

Please refer to [text-generation-inference](https://github.com/huggingface/text-generation-inference) for optimization hints.

Note that the main constraint is to be able to fit the model for the specified `batch_size` within the total device memory available
on your instance (16GB per neuron core, with 2 cores per device).

## Query the service

You can query the model using either the `/generate` or `/generate_stream` routes:

```
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```

```
curl 127.0.0.1:8080/generate_stream \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```

Note: replace 127.0.0.1:8080 with your actual IP address and port.


================================================
FILE: docs/source/backends/trtllm.md
================================================
# TensorRT-LLM backend

The NVIDIA TensorRT-LLM (TRTLLM) backend is a high-performance backend for LLMs
that uses NVIDIA's TensorRT library for inference acceleration.
It makes use of specific optimizations for NVIDIA GPUs, such as custom kernels.

To use the TRTLLM backend **you need to compile** `engines` for the models you want to use.
Each `engine` must be compiled for a given set of:
- GPU architecture that you will use for inference (e.g. A100, L40, etc.)
- Maximum batch size
- Maximum input length
- Maximum output length
- Maximum beams width

## Supported models

Check the [support matrix](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html) to see which models are
supported.

## Compiling engines

You can use [Optimum-NVIDIA](https://github.com/huggingface/optimum-nvidia) to compile engines for the models you
want to use.

```bash
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
DESTINATION="/tmp/engines/$MODEL_NAME"
HF_TOKEN="hf_xxx"
# Compile the engine using Optimum-NVIDIA
# This will create a compiled engine in the /tmp/engines/meta-llama/Llama-3.1-8B-Instruct
# directory for 1 GPU
docker run \
  --rm \
  -it \
  --gpus=1 \
  --shm-size=1g \
  -v "$DESTINATION":/engine \
  -e HF_TOKEN=$HF_TOKEN \
  -e HF_HUB_ENABLE_HF_TRANSFER=1 \
  huggingface/optimum-nvidia:v0.1.0b9-py310 \
    bash -c "optimum-cli export trtllm \
    --tp=1 \
    --pp=1 \
    --max-batch-size=64 \
    --max-input-length 4096 \
    --max-output-length 8192 \
    --max-beams-width=1 \
    --destination /tmp/engine \
    $MODEL_NAME && cp -rL /tmp/engine/* /engine/"
```

Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory, in a subfolder named after the GPU used to compile the model.

## Using the TRTLLM backend

Run TGI-TRTLLM Docker image with the compiled engine:

```bash
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
DESTINATION="/tmp/engines/$MODEL_NAME"
HF_TOKEN="hf_xxx"
docker run \
  --gpus 1 \
  --shm-size=1g \
  -it \
  --rm \
  -p 3000:3000 \
  -e MODEL=$MODEL_NAME \
  -e PORT=3000 \
  -e HF_TOKEN=$HF_TOKEN \
  -v "$DESTINATION"/<YOUR_GPU_ARCHITECTURE>/engines:/data \
  ghcr.io/huggingface/text-generation-inference:latest-trtllm \
  --model-id /data/ \
  --tokenizer-name $MODEL_NAME
```

## Development

To develop TRTLLM backend, you can use [dev containers](https://containers.dev/) with the following `.devcontainer.json` file:
```json
{
  "name": "CUDA",
  "build": {
    "dockerfile": "Dockerfile_trtllm",
    "context": ".."
  },
  "remoteEnv": {
    "PATH": "${containerEnv:PATH}:/usr/local/cuda/bin",
    "LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64",
    "XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/local/cuda"
  },
  "customizations" : {
    "jetbrains" : {
      "backend" : "CLion"
    }
  }
}
```

and `Dockerfile_trtllm`:

```Dockerfile
ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real"
ARG build_type=release
ARG ompi_version=4.1.7

# CUDA dependent dependencies resolver stage
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    build-essential \
    cmake \
    curl \
    gcc-14  \
    g++-14 \
    git \
    git-lfs \
    lld \
    libssl-dev \
    libucx-dev \
    libasan8 \
    libubsan1 \
    ninja-build \
    pkg-config \
    pipx \
    python3 \
    python3-dev \
    python3-setuptools \
    tar \
    wget --no-install-recommends && \
    pipx ensurepath

ENV TGI_INSTALL_PREFIX=/usr/local/tgi
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt

# Install OpenMPI
FROM cuda-builder AS mpi-builder
WORKDIR /opt/src/mpi

ARG ompi_version
ENV OMPI_VERSION=${ompi_version}
ENV OMPI_TARBALL_FILENAME=openmpi-${OMPI_VERSION}.tar.bz2
ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \
    https://download.open-mpi.org/release/open-mpi/v4.1/${OMPI_TARBALL_FILENAME} .

RUN tar --strip-components=1 -xf ${OMPI_TARBALL_FILENAME} &&\
    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
    make -j all && \
    make install && \
    rm -rf ${OMPI_TARBALL_FILENAME}/..

# Install TensorRT
FROM cuda-builder AS trt-builder
COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
RUN chmod +x /opt/install_tensorrt.sh && \
    /opt/install_tensorrt.sh

# Build Backend
FROM cuda-builder AS tgi-builder
WORKDIR /usr/src/text-generation-inference

# Scoped global args reuse
ARG cuda_arch_list
ARG build_type
ARG sccache_gha_enabled
ARG actions_results_url
ARG actions_runtime_token

# Install Rust
ENV PATH="/root/.cargo/bin:$PATH"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
    chmod -R a+w /root/.rustup && \
    chmod -R a+w /root/.cargo && \
    cargo install sccache --version ">=0.10.0" --locked

ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig"
ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt"

ENV USE_LLD_LINKER=ON
ENV CUDA_ARCH_LIST=${cuda_arch_list}
```


================================================
FILE: docs/source/basic_tutorials/consuming_tgi.md
================================================
# Consuming Text Generation Inference

There are many ways to consume Text Generation Inference (TGI) server in your applications. After launching the server, you can use the [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) `/v1/chat/completions` route and make a `POST` request to get results from the server. You can also pass `"stream": true` to the call if you want TGI to return a stream of tokens.

For more information on the API, consult the OpenAPI documentation of `text-generation-inference` available [here](https://huggingface.github.io/text-generation-inference).

You can make the requests using any tool of your preference, such as curl, Python, or TypeScript. For an end-to-end experience, we've open-sourced [ChatUI](https://github.com/huggingface/chat-ui), a chat interface for open-access models.

## curl

After a successful server launch, you can query the model using the `v1/chat/completions` route, to get responses that are compliant to the OpenAI Chat Completion spec:

```bash
curl localhost:8080/v1/chat/completions \
    -X POST \
    -d '{
  "model": "tgi",
  "messages": [
    {
      "role": "system",
      "content": "You are a helpful assistant."
    },
    {
      "role": "user",
      "content": "What is deep learning?"
    }
  ],
  "stream": true,
  "max_tokens": 20
}' \
    -H 'Content-Type: application/json'
```

For non-chat use-cases, you can also use the `/generate` and `/generate_stream` routes.

```bash
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{
  "inputs":"What is Deep Learning?",
  "parameters":{
    "max_new_tokens":20
  }
}' \
    -H 'Content-Type: application/json'
```

## Python

### Inference Client

[`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/main/en/index) is a Python library to interact with the Hugging Face Hub, including its endpoints. It provides a high-level class, [`huggingface_hub.InferenceClient`](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient), which makes it easy to make calls to TGI's Messages API. `InferenceClient` also takes care of parameter validation and provides a simple-to-use interface.

Install `huggingface_hub` package via pip.

```bash
pip install huggingface_hub
```

You can now use `InferenceClient` the exact same way you would use `OpenAI` client in Python

```python
from huggingface_hub import InferenceClient

client = InferenceClient(
    base_url="http://localhost:8080/v1/",
)

output = client.chat.completions.create(
    model="tgi",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Count to 10"},
    ],
    stream=True,
    max_tokens=1024,
)

for chunk in output:
    print(chunk.choices[0].delta.content)
```

You can check out more details about OpenAI compatibility [here](https://huggingface.co/docs/huggingface_hub/en/guides/inference#openai-compatibility).

There is also an async version of the client, `AsyncInferenceClient`, based on `asyncio` and `aiohttp`. You can find docs for it [here](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.AsyncInferenceClient)

### OpenAI Client

You can directly use the OpenAI [Python](https://github.com/openai/openai-python) or [JS](https://github.com/openai/openai-node) clients to interact with TGI.

Install the OpenAI Python package via pip.

```bash
pip install openai
```

```python
from openai import OpenAI

# init the client but point it to TGI
client = OpenAI(
    base_url="http://localhost:8080/v1/",
    api_key="-"
)

chat_completion = client.chat.completions.create(
    model="tgi",
    messages=[
        {"role": "system", "content": "You are a helpful assistant." },
        {"role": "user", "content": "What is deep learning?"}
    ],
    stream=True
)

# iterate and print stream
for message in chat_completion:
    print(message)
```

## UI

### Gradio

Gradio is a Python library that helps you build web applications for your machine learning models with a few lines of code. It has a `ChatInterface` wrapper that helps create neat UIs for chatbots. Let's take a look at how to create a chatbot with streaming mode using TGI and Gradio. Let's install Gradio and Hub Python library first.

```bash
pip install huggingface-hub gradio
```

Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).

```python
import gradio as gr
from huggingface_hub import InferenceClient

client = InferenceClient(base_url="http://127.0.0.1:8080")

def inference(message, history):
    partial_message = ""
    output = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": message},
        ],
        stream=True,
        max_tokens=1024,
    )

    for chunk in output:
        partial_message += chunk.choices[0].delta.content
        yield partial_message

gr.ChatInterface(
    inference,
    type="messages",
    description="This is the demo for Gradio UI consuming TGI endpoint.",
    title="Gradio 🤝 TGI",
    examples=["Are tomatoes vegetables?"],
).queue().launch()
```

You can check out the UI and try the demo directly here 👇

<div class="block dark:hidden">
	<iframe
        src="https://merve-gradio-tgi-2.hf.space?__theme=light"
        width="850"
        height="750"
    ></iframe>
</div>
<div class="hidden dark:block">
    <iframe
        src="https://merve-gradio-tgi-2.hf.space?__theme=dark"
        width="850"
        height="750"
    ></iframe>
</div>


You can read more about how to customize a `ChatInterface` [here](https://www.gradio.app/guides/creating-a-chatbot-fast).

### ChatUI

[ChatUI](https://github.com/huggingface/chat-ui) is an open-source interface built for consuming LLMs. It offers many customization options, such as web search with SERP API and more. ChatUI can automatically consume the TGI server and even provides an option to switch between different TGI endpoints. You can try it out at [Hugging Chat](https://huggingface.co/chat/), or use the [ChatUI Docker Space](https://huggingface.co/new-space?template=huggingchat/chat-ui-template) to deploy your own Hugging Chat to Spaces.

To serve both ChatUI and TGI in same environment, simply add your own endpoints to the `MODELS` variable in `.env.local` file inside the `chat-ui` repository. Provide the endpoints pointing to where TGI is served.

```
{
// rest of the model config here
"endpoints": [{"url": "https://HOST:PORT/generate_stream"}]
}
```

![ChatUI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/chatui_screen.png)


================================================
FILE: docs/source/basic_tutorials/gated_model_access.md
================================================
# Serving Private & Gated Models

If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)

If you're using the CLI, set the `HF_TOKEN` environment variable. For example:

```
export HF_TOKEN=<YOUR READ TOKEN>
```

If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.

```bash
model=meta-llama/Llama-2-7b-chat-hf
volume=$PWD/data
token=<your READ token>

docker run --gpus all \
    --shm-size 1g \
    -e HF_TOKEN=$token \
    -p 8080:80 \
    -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 \
    --model-id $model
```


================================================
FILE: docs/source/basic_tutorials/monitoring.md
================================================
# Monitoring TGI server with Prometheus and Grafana dashboard

TGI server deployment can easily be monitored through a Grafana dashboard, consuming a Prometheus data collection. Example of inspectable metrics are statistics on the effective batch sizes used by TGI, prefill/decode latencies, number of generated tokens, etc.

In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage.

![Grafana dashboard for TGI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/grafana.png)

## Setup on the server machine

First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by Prometheus monitoring server.

In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`.

On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow [Prometheus installation instructions](https://prometheus.io/download/#prometheus). For example, at the time of writing on a Linux machine:

```
wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz
tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz
cd prometheus
```

Prometheus needs to be configured to listen on TGI's port. To do so, in Prometheus configuration file `prometheus.yml`, one needs to edit the lines:
```
    static_configs:
      - targets: ["0.0.0.0:80"]
```
to use the correct IP address and port.

We suggest to try `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to make sure to configure the correct IP and port.

Once Prometheus is configured, Prometheus server can be launched on the same machine where TGI is launched:
```
./prometheus --config.file="prometheus.yml"
```

In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward Prometheus port (by default 9090) to the local computer. To do so, we can for example:
* Use ssh [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example)
* Use ngrok port tunneling

For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel Prometheus port from the TGI server to the outside world.

For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use:
```bash
ngrok http http://0.0.0.0:9090
```

As a sanity check, one can make sure that Prometheus server can be accessed at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app) from a local machine.

## Setup on the monitoring machine

Monitoring is typically done on an other machine than the server one. We use a Grafana dashboard to monitor TGI's server usage.

Two options are available:
* Use Grafana Cloud for an hosted dashboard solution (https://grafana.com/products/cloud/).
* Self-host a grafana dashboard.

In this tutorial, for simplicity, we will self host the dashbard. We recommend installing Grafana Open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example:

```bash
wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz
tar -zxvf grafana-11.0.0.linux-amd64.tar.gz
cd grafana-11.0.0
./bin/grafana-server
```

Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password.

Once logged in, the Prometheus data source for Grafana needs to be configured, in the option `Add your first data source`. There, a Prometheus data source needs to be added with the Ngrok address we got earlier, that exposes Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app).

Once Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like.

Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).

Load your dashboard configuration, and your TGI dashboard should be ready to go!


================================================
FILE: docs/source/basic_tutorials/non_core_models.md
================================================
# Non-core Model Serving

TGI supports various LLM architectures (see full list [here](../supported_models)). If you wish to serve a model that is not one of the supported models, TGI will fallback to the `transformers` implementation of that model. This means you will be unable to use some of the features introduced by TGI, such as tensor-parallel sharding or flash attention. However, you can still get many benefits of TGI, such as continuous batching or streaming outputs.

You can serve these models using the same Docker command-line invocation as with fully supported models 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
```

If the model you wish to serve is a custom transformers model, and its weights and implementation are available in the Hub, you can still serve the model by passing the `--trust-remote-code` flag to the `docker run` command like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
```

Finally, if the model is not on Hugging Face Hub but on your local, you can pass the path to the folder that contains your model like below 👇

```bash
# Make sure your model is in the $volume directory
docker run --shm-size 1g -p 8080:80 -v $volume:/data  ghcr.io/huggingface/text-generation-inference:latest --model-id /data/<PATH-TO-FOLDER>
```

You can refer to [transformers docs on custom models](https://huggingface.co/docs/transformers/main/en/custom_models) for more information.


================================================
FILE: docs/source/basic_tutorials/preparing_model.md
================================================
# Preparing the Model

Text Generation Inference improves the model in several aspects.

## Quantization

TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [Marlin](https://github.com/IST-DASLab/marlin), [EETQ](https://github.com/NetEase-FuXi/EETQ), [EXL2](https://github.com/turboderp/exllamav2), and [fp8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes`, `gptq`, `awq`, `marlin`, `exl2`, `eetq` or `fp8` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq). Similarly, when using AWQ quantization, you need to point to one of [these models](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to [quantization guide](./../conceptual/quantization)


## RoPE Scaling

RoPE scaling can be used to increase the sequence length of the model during the inference time without necessarily fine-tuning it. To enable RoPE scaling, simply pass `--rope-scaling`, `--max-input-length` and `--rope-factors` flags when running through CLI. `--rope-scaling` can take the values `linear` or `dynamic`. If your model is not fine-tuned to a longer sequence length, use `dynamic`. `--rope-factor` is the ratio between the intended max sequence length and the model's original max sequence length. Make sure to pass `--max-input-length` to provide maximum input length for extension.

<Tip>

We recommend using `dynamic` RoPE scaling.

</Tip>

## Safetensors

[Safetensors](https://github.com/huggingface/safetensors) is a fast and safe persistence format for deep learning models, and is required for tensor parallelism. TGI supports `safetensors` model loading under the hood. By default, given a repository with `safetensors` and `pytorch` weights, TGI will always load `safetensors`. If there's no `pytorch` weights, TGI will convert the weights to `safetensors` format.


================================================
FILE: docs/source/basic_tutorials/safety.md
================================================
# Model safety.

[Pytorch uses pickle](https://pytorch.org/docs/master/generated/torch.load.html) by default meaning that for quite a long while
*Every* model using that format is potentially executing unintended code while purely loading the model.

There is a big red warning on Python's page for pickle [link](https://docs.python.org/3/library/pickle.html) but for quite a while
this was ignored by the community. Now that AI/ML is getting used much more ubiquitously we need to switch away from this format.

HuggingFace is leading the effort here by creating a new format which contains pure data ([safetensors](https://github.com/huggingface/safetensors))
and moving slowly but surely all the libs to make use of it by default.
The move is intentionnally slow in order to make breaking changes as little impact as possible on users throughout.


# TGI 2.0

Since the release of TGI 2.0, we take the opportunity of this major version increase to break backward compatibility for these pytorch
models (since they are a huge security risk for anyone deploying them).


From now on, TGI will not convert automatically pickle files without having `--trust-remote-code` flag or `TRUST_REMOTE_CODE=true` in the environment variables.
This flag is already used for community defined inference code, and is therefore quite representative of the level of confidence you are giving the model providers.


If you want to use a model that uses pickle, but you still do not want to trust the authors entirely we recommend making a convertion on our space made for that.

https://huggingface.co/spaces/safetensors/convert

This space will create a PR on the original model, which you are use directly regardless of merge status from the original authors. Just use
```
docker run .... --revision refs/pr/#ID # Or use REVISION=refs/pr/#ID in the environment
```


================================================
FILE: docs/source/basic_tutorials/train_medusa.md
================================================
# Train Medusa

This tutorial will show you how to train a Medusa model on a dataset of your choice. Please check out the [speculation documentation](../conceptual/speculation) for more information on how Medusa works and speculation in general.

## What are the benefits of training a Medusa model?

Training Medusa heads can greatly improve the speed of generation. Medusa adds extra "heads" to LLMs to predict multiple future tokens simultaneously. When augmenting a model with Medusa, the original model stays untouched, and only the new heads are fine-tuned during training.

One of the most important things is to have a good dataset (with similar data to what will be used in production) because Medusa has a much higher hit-rate when the generation is in-domain.

If you train Medusa on a dataset that is very different from the one you will use in production then the model will not be able to predict the future tokens accurately and consequently the speedup will be minimal or non-existent.

## Self-distillation (Generating data for training)

There are many methods for preparing data for training, but one of the easiest and most effective ways is to "self-distill" the data. This means that you can use the same model to generate the data that you will use to train the model.

Essentially, you prompt the model with a similar input to what you will use in production and the model will generate the output.

We'll use this output to help train the medusa heads to predict the `n+1`, `n+2`, `n+3`, etc tokens in the sequence.

## Training

The original implementation of Medusa is available at [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa) and we'll follow a very similar process to train the model as described on the original repository.

### Getting Started

There are two methods for training the model:

- `torchrun` that is a wrapper around `torch.distributed.launch`
- a forked version of `axlotl` that supports Medusa

In this tutorial we'll use `torchrun` to train the model as it is the most straightforward way to train the model but similar steps can be followed to train the model using `axlotl` if you prefer.

### Training with `torchrun`

```bash
mkdir medusa-training
cd medusa-training

pyenv install 3.10
pyenv local 3.10

uv venv -p 3.10
source .venv/bin/activate
```

Now lets clone the original `Medusa` repository and install the library.

```bash
git clone https://github.com/FasterDecoding/Medusa.git
cd Medusa
pip install -e .
```

Next we'll need some data to train on, we can use the `ShareGPT_Vicuna_unfiltered` dataset that is available on the Hugging Face Hub.

```bash
apt install git-lfs
git lfs install
git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered
```

Currently our directory structure looks like this:

```bash
.
├── assets
├── CITATION.cff
├── create_data.py
├── data_generation
├── deepspeed.json
├── last_run_prepared
├── LICENSE
├── llm_judge
├── medusa
├── medusa_llm.egg-info
├── mistral.json
├── notebooks
├── pyproject.toml
├── README.md
├── ROADMAP.md
├── scripts
├── ShareGPT_Vicuna_unfiltered
│   ├── README.md
│   ├── ShareGPT_2023.05.04v0_Wasteland_Edition.json
│   └── ShareGPT_V4.3_unfiltered_cleaned_split.json
├── simple_gradio_interface.py
├── tiny-llama.json
└── vicuna_7b_qlora_stage1
```

## Start Training

Now the lets generate the data and start training the model. This process will take a while since we are generating data from the model.

First make sure you have an instance of TGI running with the model you want to use for self-distillation.

```bash
model=HuggingFaceH4/zephyr-7b-beta
volume=/home/ubuntu/.cache/huggingface/hub/

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
```

Now we can generate the data using the `create_data.py` script.

```bash
python create_data.py \
    --input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \
    --output-filename zephyr_self_distill.json
```

At this point our terminal should look like this:

<div class="flex justify-center">
    <img
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-large.gif"
        width="550"
    />
</div>

> Note: In the screen shot above we are only using a the first 500 examples from the dataset to speed up the process, you should have a much larger dataset for training.

Now we can finally get to the fun part and start training the model!

Using `torchrun` we can easily launch the `medusa` training script with the `zephyr_self_distill.json` configuration file.

> NOTE: If you just self-distilled you may still have the model running, make sure to stop it before starting the training in order to allow all of the resources to be used for training.

```bash
WANDB_MODE=offline torchrun --nproc_per_node=4 medusa/train/train_legacy.py \
    --model_name_or_path HuggingFaceH4/zephyr-7b-beta \
    --data_path zephyr_self_distill.json \
    --bf16 True \
    --output_dir zephyr_out \
    --num_train_epochs 5 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "no" \
    --save_strategy "no" \
    --learning_rate 1e-3 \
    --weight_decay 0.0 \
    --warmup_ratio 0.1 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --lazy_preprocess True \
    --medusa_num_heads 3 \
    --medusa_num_layers 1 \
    --deepspeed deepspeed.json
```

<div class="flex justify-center">
    <img
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-heads-large.gif"
        width="550"
    />
</div>

If successful, you should see the similar output to the one below:

```bash
wandb: Run history:
wandb:                    train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
wandb:              train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
wandb:            train/learning_rate ▅███▇▇▆▅▅▄▃▂▂▁▁▁
wandb:                     train/loss ██▆▄▄▃▃▂▂▃▁▁▂▁▁▁
wandb:             train/medusa0_loss ▆▆▇▆▆▅▄▅▃▃▃▃▂▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
wandb:             train/medusa0_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▄▄▄▃▄▃▄▄▅▅▆▅▆▆▇▅▇▇▄▇█▇▅▇█▆▇▇
wandb:             train/medusa1_loss ▇▇█▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▇▁▁▁▂▁▁▁▁▁
wandb:             train/medusa1_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▃▄▄▃▃▂▃▃▅▅▆▄█▆▇▅▇▇▅█▇▇▅▇█▆▆▇
wandb:             train/medusa2_loss ▃▃▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
wandb:             train/medusa2_top1 ▁▁▁▂▁▁▁▁▂▂▃▃▃▄▄▃▃▂▃▃▅▆▅▄█▆▆▅▆▆▄█▇▇▄▇█▆▆▇
wandb:               train/total_flos ▁
wandb:               train/train_loss ▁
wandb:            train/train_runtime ▁
wandb: train/train_samples_per_second ▁
wandb:   train/train_steps_per_second ▁
wandb:
wandb: Run summary:
wandb:                    train/epoch 2.0
wandb:              train/global_step 16
wandb:            train/learning_rate 0.0
wandb:                     train/loss 14.8906
wandb:             train/medusa0_loss 4.25
wandb:             train/medusa0_top1 0.28809
wandb:             train/medusa1_loss 4.8125
wandb:             train/medusa1_top1 0.22727
wandb:             train/medusa2_loss 5.5
wandb:             train/medusa2_top1 0.17293
wandb:               train/total_flos 0.0
wandb:               train/train_loss 23.98242
wandb:            train/train_runtime 396.9266
wandb: train/train_samples_per_second 2.519
wandb:   train/train_steps_per_second 0.04
```

Last but most importantly, don't forget to push this model to the Hugging Face Hub so you can use it in your projects.

```bash
python -m medusa.hf_utils \
    --folder zephyr_out_medusa_mlp_zephyr-7b-beta_medusa_3_lr_0.001_layers_1 \
    --repo drbh/zephyr_medusa_demo
```

Woo, we've successfully trained a Medusa model and pushed it to the Hugging Face Hub! 🎉


================================================
FILE: docs/source/basic_tutorials/using_cli.md
================================================
# Using TGI CLI

You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](../installation#install-cli).

`text-generation-server` lets you download the model with `download-weights` command like below 👇

```bash
text-generation-server download-weights MODEL_HUB_ID
```

You can also use it to quantize models like below 👇

```bash
text-generation-server quantize MODEL_HUB_ID OUTPUT_DIR
```

You can use `text-generation-launcher` to serve models.

```bash
text-generation-launcher --model-id MODEL_HUB_ID --port 8080
```

There are many options and parameters you can pass to `text-generation-launcher`. The documentation for CLI is kept minimal and intended to rely on self-generating documentation, which can be found by running

```bash
text-generation-launcher --help
```

You can also find it hosted in this [Swagger UI](https://huggingface.github.io/text-generation-inference/).

Same documentation can be found for `text-generation-server`.

```bash
text-generation-server --help
```


================================================
FILE: docs/source/basic_tutorials/using_guidance.md
================================================
# Guidance

Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs.

These feature are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!

_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `v1/chat/completions` endpoint._

## How it works

TGI leverages the [outlines](https://github.com/outlines-dev/outlines) library to efficiently parse and compile the grammatical structures and tools specified by users. This integration transforms the defined grammars into an intermediate representation that acts as a framework to guide and constrain content generation, ensuring that outputs adhere to the specified grammatical rules.

If you are interested in the technical details on how outlines is used in TGI, you can check out the [conceptual guidance documentation](../conceptual/guidance).

## Table of Contents 📚

### Grammar and Constraints

- [The Grammar Parameter](#the-grammar-parameter): Shape your AI's responses with precision.
- [Constrain with Pydantic](#constrain-with-pydantic): Define a grammar using Pydantic models.
- [JSON Schema Integration](#json-schema-integration): Fine-grained control over your requests via JSON schema.
- [Using the client](#using-the-client): Use TGI's client libraries to shape the AI's responses.

### Tools and Functions

- [The Tools Parameter](#the-tools-parameter): Enhance the AI's capabilities with predefined functions.
- [Via the client](#text-generation-inference-client): Use TGI's client libraries to interact with the Messages API and Tool functions.
- [OpenAI integration](#openai-integration): Use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.

## Grammar and Constraints 🛣️

### The Grammar Parameter

In TGI `1.4.3`, we've introduced the grammar parameter, which allows you to specify the format of the response you want from the LLM.

Using curl, you can make a request to TGI's Messages API with the grammar parameter. This is the most primitive way to interact with the API and using [Pydantic](#constrain-with-pydantic) is recommended for ease of use and readability.

```json
curl localhost:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
    "inputs": "I saw a puppy a cat and a raccoon during my bike ride in the park",
    "parameters": {
        "repetition_penalty": 1.3,
        "grammar": {
            "type": "json",
            "value": {
                "properties": {
                    "location": {
                        "type": "string"
                    },
                    "activity": {
                        "type": "string"
                    },
                    "animals_seen": {
                        "type": "integer",
                        "minimum": 1,
                        "maximum": 5
                    },
                    "animals": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "required": ["location", "activity", "animals_seen", "animals"]
            }
        }
    }
}'
// {"generated_text":"{ \n\n\"activity\": \"biking\",\n\"animals\": [\"puppy\",\"cat\",\"raccoon\"],\n\"animals_seen\": 3,\n\"location\": \"park\"\n}"}

```

### Hugging Face Hub Python Library

The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter.

```python
from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:3000")

schema = {
    "properties": {
        "location": {"title": "Location", "type": "string"},
        "activity": {"title": "Activity", "type": "string"},
        "animals_seen": {
            "maximum": 5,
            "minimum": 1,
            "title": "Animals Seen",
            "type": "integer",
        },
        "animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"},
    },
    "required": ["location", "activity", "animals_seen", "animals"],
    "title": "Animals",
    "type": "object",
}

user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
resp = client.text_generation(
    f"convert to JSON: 'f{user_input}'. please use the following schema: {schema}",
    max_new_tokens=100,
    seed=42,
    grammar={"type": "json", "value": schema},
)

print(resp)
# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }

```

A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar.

> Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is a computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster.

### Constrain with Pydantic

Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way.

```python
from huggingface_hub import InferenceClient
from pydantic import BaseModel, conint
from typing import List


class Animals(BaseModel):
    location: str
    activity: str
    animals_seen: conint(ge=1, le=5)  # Constrained integer type
    animals: List[str]


client = InferenceClient("http://localhost:3000")

user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
resp = client.text_generation(
    f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.model_json_schema()}",
    max_new_tokens=100,
    seed=42,
    grammar={"type": "json", "value": Animals.model_json_schema()},
)

print(resp)
# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }


```

defining a grammar as regular expressions

```python
from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:3000")

section_regex = "(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
regexp = f"HELLO\.{section_regex}\.WORLD\.{section_regex}"

# This is a more realistic example of an ip address regex
# regexp = f"{section_regex}\.{section_regex}\.{section_regex}\.{section_regex}"


resp = client.text_generation(
    f"Whats Googles DNS? Please use the following regex: {regexp}",
    seed=42,
    grammar={
        "type": "regex",
        "value": regexp,
    },
)


print(resp)
# HELLO.255.WORLD.255

```

## Tools and Functions 🛠️

### The Tools Parameter

In addition to the grammar parameter, we've also introduced a set of tools and functions to help you get the most out of the Messages API.

Tools are a set of user defined functions that can be used in tandem with the chat functionality to enhance the LLM's capabilities. Functions, similar to grammar are defined as JSON schema and can be passed as part of the parameters to the Messages API.

```json
curl localhost:3000/v1/chat/completions \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
    "model": "tgi",
    "messages": [
        {
            "role": "user",
            "content": "What is the weather like in New York?"
        }
    ],
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA"
                        },
                        "format": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "description": "The temperature unit to use. Infer this from the users location."
                        }
                    },
                    "required": ["location", "format"]
                }
            }
        }
    ],
    "tool_choice": "get_current_weather"
}'
// {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}}
```

### Chat Completion with Tools

Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter.

```python
from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:3000")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                },
                "required": ["location", "format"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_n_day_weather_forecast",
            "description": "Get an N-day weather forecast",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                    "num_days": {
                        "type": "integer",
                        "description": "The number of days to forecast",
                    },
                },
                "required": ["location", "format", "num_days"],
            },
        },
    },
]

chat = client.chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You're a helpful assistant! Answer the users question best you can.",
        },
        {
            "role": "user",
            "content": "What is the weather like in Brooklyn, New York?",
        },
    ],
    tools=tools,
    seed=42,
    max_tokens=100,
)

print(chat.choices[0].message.tool_calls)
# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')]

```

### OpenAI integration

TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.

```python
from openai import OpenAI

# Initialize the client, pointing it to one of the available models
client = OpenAI(
    base_url="http://localhost:3000/v1",
    api_key="_",
)

# NOTE: tools defined above and removed for brevity

chat_completion = client.chat.completions.create(
    model="tgi",
    messages=[
        {
            "role": "system",
            "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.",
        },
        {
            "role": "user",
            "content": "What's the weather like the next 3 days in San Francisco, CA?",
        },
    ],
    tools=tools,
    tool_choice="auto",  # tool selected by model
    max_tokens=500,
)


called = chat_completion.choices[0].message.tool_calls
print(called)
# {
#     "id": 0,
#     "type": "function",
#     "function": {
#         "description": None,
#         "name": "tools",
#         "parameters": {
#             "format": "celsius",
#             "location": "San Francisco, CA",
#             "num_days": 3,
#         },
#     },
# }
```

### Tool Choice Configuration

When configuring how the model interacts with tools during a chat completion, there are several options for determining if or how a tool should be called. These options are controlled by the `tool_choice` parameter, which specifies the behavior of the model in relation to tool usage. The following modes are supported:

1. **`auto`**:

   - The model decides whether to call a tool or generate a response message based on the user's input.
   - If tools are provided, this is the default mode.
   - Example usage:
     ```python
     tool_choice="auto"
     ```

2. **`none`**:

   - The model will never call any tools and will only generate a response message.
   - If no tools are provided, this is the default mode.
   - Example usage:
     ```python
     tool_choice="none"
     ```

3. **`required`**:

   - The model must call one or more tools and will not generate a response message on its own.
   - Example usage:
     ```python
     tool_choice="required"
     ```

4. **Specific Tool Call by Function Name**:
   - You can force the model to call a specific tool either by specifying the tool function directly or by using an object definition.
   - Two ways to do this:
     1. Provide the function name as a string:
        ```python
        tool_choice="get_current_weather"
        ```
     2. Use the function object format:
        ```python
        tool_choice={
          "type": "function",
          "function": {
              "name": "get_current_weather"
          }
        }
        ```

These options allow flexibility when integrating tools with the chat completions endpoint. You can configure the model to either rely on tools automatically or force it to follow a predefined behavior, based on the needs of the task at hand.

---

| **Tool Choice Option**                | **Description**                                                                                                                 | **When to Use**                                                                        |
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------- |
| `auto`                                | The model decides whether to call a tool or generate a message. This is the default if tools are provided.                      | Use when you want the model to decide when a tool is necessary.                        |
| `none`                                | The model generates a message without calling any tools. This is the default if no tools are provided.                          | Use when you do not want the model to call any tools.                                  |
| `required`                            | The model must call one or more tools and will not generate a message on its own.                                               | Use when a tool call is mandatory, and you do not want a regular message generated.    |
| Specific Tool Call (`name` or object) | Force the model to call a specific tool either by specifying its name (`tool_choice="get_current_weather"`) or using an object. | Use when you want to restrict the model to calling a particular tool for the response. |


================================================
FILE: docs/source/basic_tutorials/visual_language_models.md
================================================
# Vision Language Model Inference in TGI

Visual Language Model (VLM) are models that consume both image and text inputs to generate text.

VLM's are trained on a combination of image and text data and can handle a wide range of tasks, such as image captioning, visual question answering, and visual dialog.

> What distinguishes VLMs from other text and image models is their ability to handle long context and generate text that is coherent and relevant to the image even after multiple turns or in some cases, multiple images.

Below are couple of common use cases for vision language models:

- **Image Captioning**: Given an image, generate a caption that describes the image.
- **Visual Question Answering (VQA)**: Given an image and a question about the image, generate an answer to the question.
- **Mulimodal Dialog**: Generate response to multiple turns of images and conversations.
- **Image Information Retrieval**: Given an image, retrieve information from the image.

## How to Use a Vision Language Model?

### Hugging Face Hub Python Library

To infer with vision language models through Python, you can use the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The `InferenceClient` class provides a simple way to interact with the [Inference API](https://huggingface.co/docs/api-inference/index). Images can be passed as URLs or base64-encoded strings. The `InferenceClient` will automatically detect the image format.

```python
from huggingface_hub import InferenceClient

client = InferenceClient(base_url="http://127.0.0.1:3000")
image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
prompt = f"![]({image})What is this a picture of?\n\n"
for token in client.text_generation(prompt, max_new_tokens=16, stream=True):
    print(token)

# This is a picture of an anthropomorphic rabbit in a space suit.
```

```python
from huggingface_hub import InferenceClient
import base64
import requests
import io

client = InferenceClient(base_url="http://127.0.0.1:3000")

# read image from local file
image_path = "rabbit.png"
with open(image_path, "rb") as f:
    image = base64.b64encode(f.read()).decode("utf-8")

image = f"data:image/png;base64,{image}"
prompt = f"![]({image})What is this a picture of?\n\n"

for token in client.text_generation(prompt, max_new_tokens=10, stream=True):
    print(token)

# This is a picture of an anthropomorphic rabbit in a space suit.
```

or via the `chat_completion` endpoint:

```python
from huggingface_hub import InferenceClient

client = InferenceClient(base_url="http://127.0.0.1:3000")

chat = client.chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Whats in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                    },
                },
            ],
        },
    ],
    seed=42,
    max_tokens=100,
)

print(chat)
# ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043))

```

or with OpenAI's [client library](https://github.com/openai/openai-python):

```python
from openai import OpenAI

# init the client but point it to TGI
client = OpenAI(base_url="http://localhost:3000/v1", api_key="-")

chat_completion = client.chat.completions.create(
    model="tgi",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Whats in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                    },
                },
            ],
        },
    ],
    stream=False,
)

print(chat_completion)
# ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. ', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027))
```

### Inference Through Sending `cURL` Requests

To use the `generate_stream` endpoint with curl, you can add the `-N` flag. This flag disables curl default buffering and shows data as it arrives from the server.

```bash
curl -N 127.0.0.1:3000/generate_stream \
    -X POST \
    -d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":16, "seed": 42}}' \
    -H 'Content-Type: application/json'

# ...
# data:{"index":16,"token":{"id":28723,"text":".","logprob":-0.6196289,"special":false},"generated_text":"This is a picture of an anthropomorphic rabbit in a space suit.","details":null}
```

### Inference Through JavaScript

First, we need to install the `@huggingface/inference` library.

```bash
npm install @huggingface/inference
```

Whether you use Inference Providers (our serverless API), or Inference Endpoints, you can call `InferenceClient`.

We can create a `InferenceClient` providing our endpoint URL and [Hugging Face access token](https://huggingface.co/settings/tokens).

```js
import { InferenceClient } from "@huggingface/inference";

const client = new InferenceClient('hf_YOUR_TOKEN', { endpointUrl: 'https://YOUR_ENDPOINT.endpoints.huggingface.cloud' });

const prompt =
  "![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n";

const stream = client.textGenerationStream({
  inputs: prompt,
  parameters: { max_new_tokens: 16, seed: 42 },
});
for await (const r of stream) {
  // yield the generated token
  process.stdout.write(r.token.text);
}

// This is a picture of an anthropomorphic rabbit in a space suit.
```

## Combining Vision Language Models with Other Features

VLMs in TGI have several advantages, for example these models can be used in tandem with other features for more complex tasks. For example, you can use VLMs with [Guided Generation](/docs/conceptual/guided-generation) to generate specific JSON data from an image.

<div class="flex justify-center">
    <img
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
        width="400"
    />
</div>

For example we can extract information from the rabbit image and generate a JSON object with the location, activity, number of animals seen, and the animals seen. That would look like this:

```json
{
  "activity": "Standing",
  "animals": ["Rabbit"],
  "animals_seen": 1,
  "location": "Rocky surface with mountains in the background and a red light on the rabbit's chest"
}
```

All we need to do is provide a JSON schema to the VLM model and it will generate the JSON object for us.

```bash
curl localhost:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
    "inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n",
    "parameters": {
        "max_new_tokens": 100,
        "seed": 42,
        "grammar": {
            "type": "json",
            "value": {
                "properties": {
                    "location": {
                        "type": "string"
                    },
                    "activity": {
                        "type": "string"
                    },
                    "animals_seen": {
                        "type": "integer",
                        "minimum": 1,
                        "maximum": 5
                    },
                    "animals": {
                        "type": "array",
                        "items": {
                            "type": "string"
                        }
                    }
                },
                "required": ["location", "activity", "animals_seen", "animals"]
            }
        }
    }
}'

# {
#   "generated_text": "{ \"activity\": \"Standing\", \"animals\": [ \"Rabbit\" ], \"animals_seen\": 1, \"location\": \"Rocky surface with mountains in the background and a red light on the rabbit's chest\" }"
# }
```

Want to learn more about how Vision Language Models work? Check out the [awesome blog post on the topic](https://huggingface.co/blog/vlms).


================================================
FILE: docs/source/conceptual/chunking.md
================================================
# TGI v3 overview
## Summary


Performance leap: TGI processes 3x more tokens, 13x faster than vLLM on long prompts. Zero config !

### 3x more tokens.
By reducing our memory footprint, we’re able to ingest many more tokens and more dynamically than before. A single L4 (24GB) can handle 30k tokens on llama 3.1-8B, while vLLM gets barely 10k. A lot of work went into reducing the footprint of the runtime and its effect are best seen on smaller constrained environments.

### 13x faster
On long prompts (200k+ tokens) conversation replies take 27.5s in vLLM, while it takes only 2s in TGI. How so ? We keep the initial conversation around, so when a new reply comes in, we can answer almost instantly. The overhead of the lookup is ~5us. Thanks @Daniël de Kok for the beast data structure.

### Zero config
That’s it. Remove all the flags your are using and you’re likely to get the best performance. By evaluating the hardware and model, TGI carefully selects automatic values to give best performance. In production, we don’t have any flags anymore in our deployments. We kept all existing flags around, they may come in handy in niche scenarios.


## Benchmarks

### Methodology

To ensure accurate and reliable results, we employed a robust benchmarking protocol that addresses common pitfalls in performance evaluation. Specifically:

1.  **Consistent Code**: We used the same codebase to run against different engines, ensuring that any performance differences are attributable to the LLM itself, rather than variations in the testing framework.
2.  **Request-Based Measurement**: Instead of measuring Requests Per Second (RPS) by sending as many requests as possible, we opted for a more consistent approach, sending a fixed number of requests and measuring the time it takes for the server to complete all of them. This method avoids boundary effects and provides a more accurate representation of performance.
3.  **Realistic Combinations**: We selected realistic combinations of LLMs and hardware configurations so we used 8xH100 for a 70B, not a 8B, which would be a waste of money.
4.  **Realistic scenarios** We benchmarked engines with prefix caching on, so we are reporting the results of the 2nd run, not the first one.
During the first run of a benchmark, every request is new, so prefix caching is not working, masking the real world benefits of using it.

Note: Boundary effect is when the benchmarks are flaky because their results depend on fine details of the engine being benchmarked.
For instance, a system ingesting a constant 10RPS, but receiving in the benchmark a single final request at -0.1s before the end of the benchmark, and that single request takes a full 10s to process. Then a benchmark taking 30s would measure 7.5RPS instead of the expected 10, because that single query isn't being parallelized with others. Another very slightly slower engine would receive that request at +0.1s which would get discarded by the benchmark and therefore measure the slower system as being faster.

For more details on benchmarking in general we recommend the documentation of k6: https://grafana.com/docs/k6/latest/.

### Scenarios

We selected a handful of scenarios to simplify the picture, they seem to accurately reflect a larger trend.

1. **Small scenario**: This scenario consists of the first 200 requests from the orca datasets being prompted to the model. The 200 requests total 8k tokens together and are representative of conversation starters. Prefix caching has very limited impact in that scenario and we feel it's a relatively balanced benchmark for simple use cases.
2. **Long scenario**: This scenario consists of 20 requests totalling 200k prompt tokens which are essentially asking for summaries of large chunks for text. In practical scenarios this is really useful when you are feeding large chunks of code, large chunks of business data or documents repeatedly and ask simple questions about them (summarization, classification, or where to find some data). This scenario is the one closest to what a lot of professional use cases seem to be doing by including a lot of information in the prompt itself. Those very long conversations are the ones that benefit the most for our recent changes since we are enable ever larger prompts and ever faster caching.

   ### Hardware

   1. `L4` : This is a single L4 (24GB) which represents small or even home compute capabilities. We tested `meta-llama/Meta-Llama-3.1-8B-Instruct` on it.
   2. `4xL4`: This is a more beefy deployment usually used for either very large requests deployments for 8B models (the ones under test) or it can also easily handle all 30GB models. For this benchmark we tested `meta-llama/Meta-Llama-3.1-8B-Instruct`
   3. `8xH100` This is one of the beefiest deployments possible. We tested  `meta-llama/Meta-Llama-3.1-70B-Instruct` as it's the most representative models of this size. Llama 3.3 wasn't released at the time of benchmarking (it's the exact same model so it doesn't make any difference).


### Replicating the results


The commands to run the benchmarks are as follows:

1. Prepare the datasets:

```bash
cd text-generation-inference/load_tests
make prepare_orca
python long.py
```

2. Launch the engine:

TGI: `text-generation-launcher --model-id $MODEL_ID --num-shard $N --port 8000` (or docker variant)
vLLM: `vllm serve $MODEL_ID --tensor-parallel $N —enable-prefix-caching` (or docker variant)

3. Start scenario:
Small: `MODEL_ID=$MODEL_ID  HOST=localhost:8000 k6 run load_tests/common.js`
Long:  `MODEL_ID=$MODEL_ID  HOST=localhost:8000 k6 run load_tests/long.js`


### Results

![benchmarks_v3](https://raw.githubusercontent.com/huggingface/text-generation-inference/refs/heads/main/assets/v3_benchmarks.png)

Our benchmarking results show significant performance gains, with a 13x speedup over vLLM with prefix caching, and up to 30x speedup without prefix caching. These results are consistent with our production data and demonstrate the effectiveness of our optimized LLM architecture.

Raw results

|   |   |   |   |   |
|---|---|---|---|---|
|2nd run ||**TGI v3** (time in s)|**vLLM** (s)|**Amount of req**|
|**Llama 3.1 8b**|Small test - L4 - 8B|17.5|19.9|200|
|**Llama 3.1 8b**|Long test* - L4 - 8B|53|57|10|
|**Llama 3.1 8b**|Small test - 4xL4 - 8B|4.8|6|200|
|**Llama 3.1 8b**|Long test - 4xL4 - 8B|3.2|12.5|20|
|**Llama 3.1 70b**|Small test - 8XH100 - 70B|6.2|7.4|200|
|**Llama 3.1 70b**|Long test - 8H100 - 70B|2|27.5|20|
||||||
|1st run ||TGI (s)|vLLM (s)|Amount of req|
|**Llama 3.1 8b**|Small test - L4|19.9|19.9|200|
|**Llama 3.1 8b**|Long test (10) - L4|49.8|55|10|
|**Llama 3.1 8b**|Small test - 4xL4|13|12.6|200|
|**Llama 3.1 8b**|Long test - 4xL4|47|50.3|20|
|**Llama 3.1 70b**|Small test - 8XH100|7.5|7.6|200|
|**Llama 3.1 70b**|Long test - 8H100|12.1|28.3|20|


### Caveats and Limitations

While our results are promising, there are some caveats to consider:

1. **Constrained kv-cache**: If a deployment lacks kv-cache space, that means that many queries will require the same slots of kv-cache, leading to contention in the kv-cache. You can limit that effect by limiting `--max-total-tokens` to reduce individual queries impact. You can also use more GPUs or larger GPUs in order to increase the size of the kv-cache.
2.  **Replication**: In scenarios where multiple replicas are behind a single endpoint, there's no reason for every query from a particular user to hit the same replica, therefore the cache will not be present, meaning no speed benefit. You can use sticky sessions load balancing to force every user to send their requests on the same replica. Do not apply this blindly, it's possible this may not be necessary at all.

## Technical Insights

Our performance gains can be attributed to several key factors:

1.  **New Kernels**: Our custom kernels, including `flashinfer` and `flashdecoding`, offer improved performance at large prompt lengths and enable more efficient scheduling.
2.  **Prefix Caching**: Our optimized prefix caching structure allows for fast query matching, even for long prompts. The overhead is roughly 6us.
3.  **Chunking Code**: Our chunking code enables finer control over compute resources, ensuring optimal performance and reduced VRAM usage.
4.  **Kernel Optimizations**: We've implemented various other kernel optimizations, including better kernel selection. Notably we've implemented several small kernels involved in the queries bookkeeping which are particularly efficient on small models. Every kernel launch has an overhead of several milliseconds so fusing them together increases a lot performance when this bookkeeping is important relative to the raw model calculations. This happens typically on oversized compute for a particular model and particularly small models.
5. **VRAM efficiency**: In the realm of very large requests (100k+ tokens) there are a lot of places which start becoming big memory consumers. We've hunted the biggest ones and found ways to reduce/reuse or delete them. The biggest culprit probably is `logits` calculation. Logits for llama 3.1-8b take 25.6GB (=100k tokens * 128k vocabulary * 2(f16)) which is more than the full model which is 16GB. The thing is that in general we do not need every prompt logits, so we simply removed them and removed them from being potentially asked by users by default. We think this is ok since they are mostly used by researchers. You can enable your deployments to have them again by using the `--enable-prefill-logprobs` flag, but you will experience reduced token prompt size.

## Future Directions

While we've made significant progress, there are still opportunities for improvement:

1.  **Special models**: All LLMs come with the aforementioned improvements. Some specific set of features might not (some quantizations, speculation or VLMs for instance are harder to optimize for with the same level of detail).
2.  **KV-Cache Long-Term Retention**: Addressing KV-cache long-term retention is a challenge. There are several solutions envisionned like shared KV-cache (like redis or memcached) solutions or innovative storage approaches. It is an area of ongoing research of ours.
3.  **Multimodal models**: We are also investigating quite a lot other kind of models, like audio-to-audio, image/video generation, and other hybrids, where we see a lot of potential of applying the same principles we've applied in TGI to maximize performance.

By sharing our benchmarking methodology, results, and technical insights, we aim to contribute to the ongoing development of more efficient and effective LLMs.


================================================
FILE: docs/source/conceptual/external.md
================================================
# External Resources

- Adyen wrote a detailed article about the interplay between TGI's main components: router and server.
[LLM inference at scale with TGI (Martin Iglesias Goyanes - Adyen, 2024)](https://www.adyen.com/knowledge-hub/llm-inference-at-scale-with-tgi)


================================================
FILE: docs/source/conceptual/flash_attention.md
================================================
# Flash Attention

Scaling the transformer architecture is heavily bottlenecked by the self-attention mechanism, which has quadratic time and memory complexity. Recent developments in accelerator hardware mainly focus on enhancing compute capacities and not memory and transferring data between hardware. This results in attention operation having a memory bottleneck. **Flash Attention** is an attention algorithm used to reduce this problem and scale transformer-based models more efficiently, enabling faster training and inference.

Standard attention mechanism uses High Bandwidth Memory (HBM) to store, read and write keys, queries and values. HBM is large in memory, but slow in processing, meanwhile SRAM is smaller in memory, but faster in operations. In the standard attention implementation, the cost of loading and writing keys, queries, and values from HBM is high. It loads keys, queries, and values from HBM to GPU on-chip SRAM, performs a single step of the attention mechanism, writes it back to HBM, and repeats this for every single attention step. Instead, Flash Attention loads keys, queries, and values once, fuses the operations of the attention mechanism, and writes them back.

![Flash Attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png)

It is implemented for supported models. You can check out the complete list of models that support Flash Attention [here](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models), for models with flash prefix.

You can learn more about Flash Attention by reading the paper in this [link](https://arxiv.org/abs/2205.14135).


================================================
FILE: docs/source/conceptual/guidance.md
================================================
# Guidance

## What is Guidance?

Guidance is a feature that allows users to constrain the generation of a large language model with a specified grammar. This feature is particularly useful when you want to generate text that follows a specific structure or uses a specific set of words or produce output in a specific format. A prominent example is JSON grammar, where the model is forced to output valid JSON.

## How is it used?

Guidance can be implemented in many ways and the community is always finding new ways to use it. Here are some examples of how you can use guidance:

Technically, guidance can be used to generate:

- a specific JSON object
- a function signature
- typed output like a list of integers

However these use cases can span a wide range of applications, such as:

- extracting structured data from unstructured text
- summarizing text into a specific format
- limit output to specific classes of words (act as a LLM powered classifier)
- generate the input to specific APIs or services
- provide reliable and consistent output for downstream tasks
- extract data from multimodal inputs

## How it works?

Diving into the details, guidance is enabled by including a grammar with a generation request that is compiled, and used to modify the chosen tokens.

This process can be broken down into the following steps:

1. A request is sent to the backend, it is processed and placed in batch. Processing includes compiling the grammar into a finite state machine and a grammar state.

<div class="flex justify-center">
    <img
        class="block dark:hidden"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/request-to-batch.gif"
    />
    <img
        class="hidden dark:block"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/request-to-batch-dark.gif"
    />
</div>

2. The model does a forward pass over the batch. This returns probabilities for each token in the vocabulary for each request in the batch.

3. The process of choosing one of those tokens is called `sampling`. The model samples from the distribution of probabilities to choose the next token. In TGI all of the steps before sampling are called `processor`. Grammars are applied as a processor that masks out tokens that are not allowed by the grammar.

<div class="flex justify-center">
    <img
        class="block dark:hidden"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/logit-grammar-mask.gif"
    />
    <img
        class="hidden dark:block"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/logit-grammar-mask-dark.gif"
    />
</div>

4. The grammar mask is applied and the model samples from the remaining tokens. Once a token is chosen, we update the grammar state with the new token, to prepare it for the next pass.

<div class="flex justify-center">
    <img
        class="block dark:hidden"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/sample-logits.gif"
    />
    <img
        class="hidden dark:block"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/sample-logits-dark.gif"
    />
</div>

## How to use Guidance?

There are two main ways to use guidance; you can either use the `/generate` endpoint with a grammar or use the `/chat/completion` endpoint with tools.

Under the hood tools are a special case of grammars that allows the model to choose one or none of the provided tools.

Please refer to [using guidance](../basic_tutorials/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL.

### Getting the most out of guidance

Depending on how you are using guidance, you may want to make use of different features. Here are some tips to get the most out of guidance:

- If you are using the `/generate` with a `grammar` it is recommended to include the grammar in the prompt prefixed by something like `Please use the following JSON schema to generate the output:`. This will help the model understand the context of the grammar and generate the output accordingly.
- If you are getting a response with many repeated tokens, please use the `frequency_penalty` or `repetition_penalty` to reduce the number of repeated tokens in the output.


================================================
FILE: docs/source/conceptual/lora.md
================================================
# LoRA (Low-Rank Adaptation)

## What is LoRA?

LoRA is a technique that allows for efficent fine-tuning a model while only updating a small portion of the model's weights. This is useful when you have a large model that has been pre-trained on a large dataset, but you want to fine-tune it on a smaller dataset or for a specific task.

LoRA works by adding a small number of additional weights to the model, which are used to adapt the model to the new dataset or task. These additional weights are learned during the fine-tuning process, while the rest of the model's weights are kept fixed.

## How is it used?

LoRA can be used in many ways and the community is always finding new ways to use it. Here are some examples of how you can use LoRA:

Technically, LoRA can be used to fine-tune a large language model on a small dataset. However, these use cases can span a wide range of applications, such as:

- fine-tuning a language model on a small dataset
- fine-tuning a language model on a domain-specific dataset
- fine-tuning a language model on a dataset with limited labels

## Optimizing Inference with LoRA

LoRA's can be used during inference by mutliplying the adapter weights with the model weights at each specified layer. This process can be computationally expensive, but due to awesome work by [punica-ai](https://github.com/punica-ai/punica) and the [lorax](https://github.com/predibase/lorax) team, optimized kernels/and frameworks have been developed to make this process more efficient. TGI leverages these optimizations in order to provide fast and efficient inference with mulitple LoRA models.

## Serving multiple LoRA adapters with TGI

Once a LoRA model has been trained, it can be used to generate text or perform other tasks just like a regular language model. However, because the model has been fine-tuned on a specific dataset, it may perform better on that dataset than a model that has not been fine-tuned.

In practice its often useful to have multiple LoRA models, each fine-tuned on a different dataset or for a different task. This allows you to use the model that is best suited for a particular task or dataset.

Text Generation Inference (TGI) now supports loading multiple LoRA models at startup that can be used in generation requests. This feature is available starting from version `~2.0.6` and is compatible with LoRA models trained using the `peft` library.

### Specifying LoRA models

To use LoRA in TGI, when starting the server, you can specify the list of LoRA models to load using the `LORA_ADAPTERS` environment variable. For example:

```bash
LORA_ADAPTERS=predibase/customer_support,predibase/dbpedia
```

To specify model revision, use `adapter_id@revision`, as follows:

```bash
LORA_ADAPTERS=predibase/customer_support@main,predibase/dbpedia@rev2
```

To use a locally stored lora adapter, use `adapter-name=/path/to/adapter`, as seen below. When you want to use this adapter, set `"parameters": {"adapter_id": "adapter-name"}"`

```bash
LORA_ADAPTERS=myadapter=/some/path/to/adapter,myadapter2=/another/path/to/adapter
```

note it's possible to mix adapter_ids with adapter_id=adapter_path e.g.

```bash
LORA_ADAPTERS=predibase/dbpedia,myadapter=/path/to/dir/
```

In the server logs, you will see the following message:

```txt
Loading adapter weights into model: predibase/customer_support
Loading adapter weights into model: predibase/dbpedia
```

## Generate text

You can then use these models in generation requests by specifying the `lora_model` parameter in the request payload. For example:

```json
curl 127.0.0.1:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
  "inputs": "Hello who are you?",
  "parameters": {
    "max_new_tokens": 40,
    "adapter_id": "predibase/customer_support"
  }
}'
```

If you are using a lora adapter stored locally that was set in the following manner: `LORA_ADAPTERS=myadapter=/some/path/to/adapter`, here is an example payload:

```json
curl 127.0.0.1:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
  "inputs": "Hello who are you?",
  "parameters": {
    "max_new_tokens": 40,
    "adapter_id": "myadapter"
  }
}'
```


> **Note:** The Lora feature is new and still being improved. If you encounter any issues or have any feedback, please let us know by opening an issue on the [GitHub repository](https://github.com/huggingface/text-generation-inference/issues/new/choose). Additionally documentation and an improved client library will be published soon.

An updated tutorial with detailed examples will be published soon. Stay tuned!


================================================
FILE: docs/source/conceptual/paged_attention.md
================================================
# PagedAttention

LLMs struggle with memory limitations during generation. In the decoding part of generation, all the attention keys and values generated for previous tokens are stored in GPU memory for reuse. This is called _KV cache_, and it may take up a large amount of memory for large models and long sequences.

PagedAttention attempts to optimize memory use by partitioning the KV cache into blocks that are accessed through a lookup table. Thus, the KV cache does not need to be stored in contiguous memory, and blocks are allocated as needed. The memory efficiency can increase GPU utilization on memory-bound workloads, so more inference batches can be supported.

The use of a lookup table to access the memory blocks can also help with KV sharing across multiple generations. This is helpful for techniques such as _parallel sampling_, where multiple outputs are generated simultaneously for the same prompt. In this case, the cached KV blocks can be shared among the generations.

TGI's PagedAttention implementation leverages the custom cuda kernels developed by the [vLLM Project](https://github.com/vllm-project/vllm). You can learn more about this technique in the [project's page](https://vllm.ai/).


================================================
FILE: docs/source/conceptual/quantization.md
================================================
# Quantization

TGI offers many quantization schemes to run LLMs effectively and fast based on your use-case. TGI supports GPTQ, AWQ, bits-and-bytes, EETQ, Marlin, EXL2 and fp8 quantization.

To leverage GPTQ, AWQ, Marlin and EXL2 quants, you must provide pre-quantized weights. Whereas for bits-and-bytes, EETQ and fp8, weights are quantized by TGI on the fly.

We recommend using the official quantization scripts for creating your quants:
1. [AWQ](https://github.com/casper-hansen/AutoAWQ/blob/main/examples/quantize.py)
2. [GPTQ/ Marlin](https://github.com/AutoGPTQ/AutoGPTQ/blob/main/examples/quantization/basic_usage.py)
3. [EXL2](https://github.com/turboderp/exllamav2/blob/master/doc/convert.md)

For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.

## Quantization with bitsandbytes, EETQ & fp8

bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing – weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.

8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes
```

4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.

In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize bitsandbytes-nf4
```

You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

Similarly you can use pass you can pass `--quantize eetq` or `--quantize fp8` for respective quantization schemes.

In addition to this, TGI allows creating GPTQ quants directly by passing the model weights and a calibration dataset.

## Quantization with GPTQ

GPTQ is a post-training quantization method to make the model smaller. It quantizes the layers by finding a compressed version of that weight, that will yield a minimum mean squared error like below 👇

Given a layer \\(l\\) with weight matrix \\(W_{l}\\) and layer input \\(X_{l}\\), find quantized weight \\(\\hat{W}_{l}\\):

$$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$


TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.5 --model-id $model --quantize gptq
```

Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.

To quantize a given model using GPTQ with a calibration dataset, simply run

```bash
text-generation-server quantize tiiuae/falcon-40b /data/falcon-40b-gptq
# Add --upload-to-model-id MYUSERNAME/falcon-40b to push the created model to the hub directly
```

This will create a new directory with the quantized files which you can use with,

```bash
text-generation-launcher --model-id /data/falcon-40b-gptq/ --sharded true --num-shard 2 --quantize gptq
```

You can learn more about the quantization options by running `text-generation-server quantize --help`.

If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).


================================================
FILE: docs/source/conceptual/safetensors.md
================================================
# Safetensors

Safetensors is a model serialization format for deep learning models. It is [faster](https://huggingface.co/docs/safetensors/speed) and safer compared to other serialization formats like pickle (which is used under the hood in many deep learning libraries).

TGI depends on safetensors format mainly to enable [tensor parallelism sharding](./tensor_parallelism). For a given model repository during serving, TGI looks for safetensors weights. If there are no safetensors weights, TGI converts the PyTorch weights to safetensors format.

You can learn more about safetensors by reading the [safetensors documentation](https://huggingface.co/docs/safetensors/index).


================================================
FILE: docs/source/conceptual/speculation.md
================================================
## Speculation


Speculative decoding, assisted generation, Medusa, and others are a few different names for the same idea.
The idea is to generate tokens *before* the large model actually runs, and only *check* if those tokens where valid.

So you are making *more* computations on your LLM, but if you are correct you produce 1, 2, 3 etc.. tokens on a single LLM pass. Since LLMs are usually memory bound (and not compute bound), provided your guesses are correct enough, this is a 2-3x faster inference (It can be much more for code oriented tasks for instance).

You can check a more [detailed explanation](https://huggingface.co/blog/assisted-generation).

Text-generation inference supports 2 main speculative methods:

- Medusa
- N-gram


### Medusa


Medusa is a [simple method](https://arxiv.org/abs/2401.10774) to create many tokens in a single pass using fine-tuned LM heads in addition to your existing models.


You can check a few existing  fine-tunes for popular models:

- [text-generation-inference/gemma-7b-it-medusa](https://huggingface.co/text-generation-inference/gemma-7b-it-medusa)
- [text-generation-inference/Mixtral-8x7B-Instruct-v0.1-medusa](https://huggingface.co/text-generation-inference/Mixtral-8x7B-Instruct-v0.1-medusa)
- [text-generation-inference/Mistral-7B-Instruct-v0.2-medusa](https://huggingface.co/text-generation-inference/Mistral-7B-Instruct-v0.2-medusa)


In order to create your own medusa heads for your own finetune, you should check own the original medusa repo. Read for more in [Train Medusa](../basic_tutorials/train_medusa#training).


In order to use medusa models in TGI, simply point to a medusa enabled model, and everything will load automatically.


### N-gram


If you don't have a medusa model, or don't have the resource to fine-tune, you can try to use `n-gram`.
N-gram works by trying to find matching tokens in the previous sequence, and use those as speculation for generating new tokens. For example, if the tokens "np.mean" appear multiple times in the sequence, the model can speculate that the next continuation of the tokens "np." is probably also "mean".

This is an extremely simple method, which works best for code, or highly repetitive text. This might not be beneficial, if the speculation misses too much.


In order to enable n-gram speculation simply use

`--speculate 2` in your flags.

[Details about the flag](https://huggingface.co/docs/text-generation-inference/basic_tutorials/launcher#speculate)


================================================
FILE: docs/source/conceptual/streaming.md
================================================
# Streaming


## What is Streaming?

Token streaming is the mode in which the server returns the tokens one by one as the model generates them. This enables showing progressive generations to the user rather than waiting for the whole generation. Streaming is an essential aspect of the end-user experience as it reduces latency, one of the most critical aspects of a smooth experience.

<div class="flex justify-center">
    <img
        class="block dark:hidden"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual_360.gif"
    />
    <img
        class="hidden dark:block"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual-dark_360.gif"
    />
</div>

With token streaming, the server can start returning the tokens one by one before having to generate the whole response. Users can have a sense of the generation's quality before the end of the generation. This has different positive effects:

* Users can get results orders of magnitude earlier for extremely long queries.
* Seeing something in progress allows users to stop the generation if it's not going in the direction they expect.
* Perceived latency is lower when results are shown in the early stages.
* When used in conversational UIs, the experience feels more natural.

For example, a system can generate 100 tokens per second. If the system generates 1000 tokens, with the non-streaming setup, users need to wait 10 seconds to get results. On the other hand, with the streaming setup, users get initial results immediately, and although end-to-end latency will be the same, they can see half of the generation after five seconds. Below you can see an interactive demo that shows non-streaming vs streaming side-by-side. Click **generate** below.

<div class="block dark:hidden">
	<iframe
        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=light"
        width="850"
        height="350"
    ></iframe>
</div>
<div class="hidden dark:block">
    <iframe
        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=dark"
        width="850"
        height="350"
    ></iframe>
</div>

## How to use Streaming?

### Streaming with Python

To stream tokens with `InferenceClient`, simply pass `stream=True` and iterate over the response.

```python
from huggingface_hub import InferenceClient

client = InferenceClient(base_url="http://127.0.0.1:8080")
output = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Count to 10"},
    ],
    stream=True,
    max_tokens=1024,
)

for chunk in output:
    print(chunk.choices[0].delta.content)

# 1
# 2
# 3
# 4
# 5
# 6
# 7
# 8
# 9
# 10
```

The `huggingface_hub` library also comes with an `AsyncInferenceClient` in case you need to handle the requests concurrently.

```python
from huggingface_hub import AsyncInferenceClient

client = AsyncInferenceClient(base_url="http://127.0.0.1:8080")
async def main():
    stream = await client.chat.completions.create(
        messages=[{"role": "user", "content": "Say this is a test"}],
        stream=True,
    )
    async for chunk in stream:
        print(chunk.choices[0].delta.content or "", end="")

asyncio.run(main())

# This
# is
# a
# test
#.
```

### Streaming with cURL

To use the OpenAI Chat Completions compatible Messages API `v1/chat/completions` endpoint with curl, you can add the `-N` flag, which disables curl default buffering and shows data as it arrives from the server

```curl
curl localhost:8080/v1/chat/completions \
    -X POST \
    -d '{
  "model": "tgi",
  "messages": [
    {
      "role": "system",
      "content": "You are a helpful assistant."
    },
    {
      "role": "user",
      "content": "What is deep learning?"
    }
  ],
  "stream": true,
  "max_tokens": 20
}' \
    -H 'Content-Type: application/json'
```

### Streaming with JavaScript

First, we need to install the `@huggingface/inference` library.

```bash
npm install @huggingface/inference
```

Whether you use Inference Providers (our serverless API), or Inference Endpoints, you can call `InferenceClient`.


```js
import { InferenceClient } from '@huggingface/inference';

const client = new InferenceClient('hf_YOUR_TOKEN', { endpointUrl: 'https://YOUR_ENDPOINT.endpoints.huggingface.cloud' });

// prompt
const prompt = 'What can you do in Nuremberg, Germany? Give me 3 Tips';

const stream = client.textGenerationStream({ inputs: prompt });
for await (const r of stream) {
  // yield the generated token
  process.stdout.write(r.token.text);
}
```

## How does Streaming work under the hood?

Under the hood, TGI uses Server-Sent Events (SSE). In an SSE Setup, a client sends a request with the data, opening an HTTP connection and subscribing to updates. Afterward, the server sends data to the client. There is no need for further requests; the server will keep sending the data. SSEs are unidirectional, meaning the client does not send other requests to the server. SSE sends data over HTTP, making it easy to use.

SSEs are different than:
* Polling: where the client keeps calling the server to get data. This means that the server might return empty responses and cause overhead.
* Webhooks: where there is a bi-directional connection. The server can send information to the client, but the client can also send data to the server after the first request. Webhooks are more complex to operate as they don’t only use HTTP.

If there are too many requests at the same time, TGI returns an HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g., it could display a busy error to the user or retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing clients to handle backpressure.


================================================
FILE: docs/source/conceptual/tensor_parallelism.md
================================================
# Tensor Parallelism

Tensor parallelism is a technique used to fit a large model in multiple GPUs. For example, when multiplying the input tensors with the first weight tensor, the matrix multiplication is equivalent to splitting the weight tensor column-wise, multiplying each column with the input separately, and then concatenating the separate outputs. These outputs are then transferred from the GPUs and concatenated together to get the final result, like below 👇

![Image courtesy of Anton Lozkhov](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/TP.png)


<Tip warning={true}>

Tensor Parallelism only works for [models officially supported](../supported_models), it will not work when falling back to `transformers`. You can get more information about unsupported models [here](../basic_tutorials/non_core_models).

</Tip>

You can learn a lot more details about tensor-parallelism from [the `transformers` docs](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#tensor-parallelism).


================================================
FILE: docs/source/index.md
================================================
# Text Generation Inference


> [!CAUTION]
> text-generation-inference is now in maintenance mode. Going forward, we will accept pull requests for minor bug fixes, documentation improvements and lightweight maintenance tasks.
>
> TGI has initiated the movement for optimized inference engines to rely on a `transformers` model architectures. This approach is now adopted by downstream inference engines, which we contribute to and recommend using going forward: [vllm](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), as well as local engines with inter-compatibility such as llama.cpp or MLX.


Text Generation Inference (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and T5.

![Text Generation Inference](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)

Text Generation Inference implements many optimizations and features, such as:

- Simple launcher to serve most popular LLMs
- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)
- Tensor Parallelism for faster inference on multiple GPUs
- Token streaming using Server-Sent Events (SSE)
- Continuous batching of incoming requests for increased total throughput
- Optimized transformers code for inference using [Flash Attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323)
- [Safetensors](https://github.com/huggingface/safetensors) weight loading
- Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
- Logits warper (temperature scaling, top-p, top-k, repetition penalty)
- Stop sequences
- Log probabilities
- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance.
- [Guidance](conceptual/guidance): Enable function calling and tool-use by forcing the model to generate structured outputs based on your own predefined output schemas.

Text Generation Inference is used in production by multiple projects, such as:

- [Hugging Chat](https://github.com/huggingface/chat-ui), an open-source interface for open-access models, such as Open Assistant and Llama
- [OpenAssistant](https://open-assistant.io/), an open-source community effort to train LLMs in the open
- [nat.dev](http://nat.dev/), a playground to explore and compare LLMs.


================================================
FILE: docs/source/installation.md
================================================
# Installation from source

<Tip warning={true}>

Installing TGI from source is not the recommended usage. We strongly recommend to use TGI through Docker, check the [Quick Tour](./quicktour), [Installation for Nvidia GPUs](./installation_nvidia) and [Installation for AMD GPUs](./installation_amd) to learn how to use TGI with Docker.

</Tip>

## Install CLI

You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters.

To install the CLI, you need to first clone the TGI repository and then run `make`.

```bash
git clone https://github.com/huggingface/text-generation-inference.git && cd text-generation-inference
make install
```

If you would like to serve models with custom kernels, run

```bash
BUILD_EXTENSIONS=True make install
```

## Local Installation from Source

Before you start, you will need to setup your environment, and install Text Generation Inference. Text Generation Inference is tested on **Python 3.9+**.

Text Generation Inference is available on pypi, conda and GitHub.

To install and launch locally, first [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using conda:

```bash
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

conda create -n text-generation-inference python=3.9
conda activate text-generation-inference
```

You may also need to install Protoc.

On Linux:

```bash
PROTOC_ZIP=protoc-21.12-linux-x86_64.zip
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
sudo unzip -o $PROTOC_ZIP -d /usr/local bin/protoc
sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP
```

On MacOS, using Homebrew:

```bash
brew install protobuf
```

Then run to install Text Generation Inference:

```bash
git clone https://github.com/huggingface/text-generation-inference.git && cd text-generation-inference
BUILD_EXTENSIONS=True make install
```

<Tip warning={true}>

On some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:

```bash
sudo apt-get install libssl-dev gcc -y
```

</Tip>

Once installation is done, simply run:

```bash
make run-falcon-7b-instruct
```

This will serve Falcon 7B Instruct model from the port 8080, which we can query.


================================================
FILE: docs/source/installation_amd.md
================================================
# Using TGI with AMD GPUs

TGI is supported and tested on [AMD Instinct MI210](https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html), [MI250](https://www.amd.com/en/products/accelerators/instinct/mi200/mi250.html) and [MI300](https://www.amd.com/en/products/accelerators/instinct/mi300.html) GPUs. The support may be extended in the future. The recommended usage is through Docker. Make sure to check the [AMD documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html) on how to use Docker with AMD GPUs.

On a server powered by AMD GPUs, TGI can be launched with the following command:

```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
    --device=/dev/kfd --device=/dev/dri --group-add video \
    --ipc=host --shm-size 256g --net host -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.3.5-rocm \
    --model-id $model
```

The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.

## TunableOp

TGI's docker image for AMD GPUs integrates [PyTorch's TunableOp](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable), which allows to do an additional warmup to select the best performing matrix multiplication (GEMM) kernel from rocBLAS or hipBLASLt.

Experimentally, on MI300X, we noticed a 6-8% latency improvement when using TunableOp on top of ROCm 6.1 and PyTorch 2.3.

TunableOp is enabled by default, the warmup may take 1-2 minutes. In case you would like to disable TunableOp, please pass `--env PYTORCH_TUNABLEOP_ENABLED="0"` when launcher TGI's docker container.

## Flash attention implementation

Two implementations of Flash Attention are available for ROCm, the first is [ROCm/flash-attention](https://github.com/ROCm/flash-attention) based on a [Composable Kernel](https://github.com/ROCm/composable_kernel) (CK) implementation, and the second is a [Triton implementation](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/layers/attention/flash_attn_triton.py).

By default, the Composable Kernel implementation is used. However, the Triton implementation has slightly lower latency on MI250 and MI300, but requires a warmup which can be prohibitive as it needs to be done again for each new prompt length. If needed, FA Triton impelmentation can be enabled with `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container.

## Custom PagedAttention

For better performance on ROCm, a custom Paged Attention kernel is available and is enabled by default. To disable it and fall back to the PagedAttention v2 kernel, set the environment variable `ROCM_USE_CUSTOM_PAGED_ATTN=0`.

The custom kernel supports bf16 and fp16 data types, block size of 16, head size of 128, a maximum context length of 16k, and GQA ratios between 1 and 16. For other configurations, we use the PagedAttention v2 kernel.

## Unsupported features

The following features are currently not supported in the ROCm version of TGI, and the support may be extended in the future:
* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
* Kernel for sliding window attention (Mistral)


================================================
FILE: docs/source/installation_gaudi.md
================================================
# Using TGI with Intel Gaudi

You can use TGI on Intel Gaudi using the [TGI gaudi backend](https://huggingface.co/docs/text-generation-inference/backends/gaudi).


================================================
FILE: docs/source/installation_inferentia.md
================================================
# Using TGI with Inferentia

You can use TGI on AWS Trainium and Inferentia platforms using the [TGI neuron backend](https://huggingface.co/docs/text-generation-inference/backends/neuron).


================================================
FILE: docs/source/installation_intel.md
================================================
# Using TGI with Intel GPUs

TGI optimized models are supported on Intel Data Center GPU [Max1100](https://www.intel.com/content/www/us/en/products/sku/232876/intel-data-center-gpu-max-1100/specifications.html), [Max1550](https://www.intel.com/content/www/us/en/products/sku/232873/intel-data-center-gpu-max-1550/specifications.html), the recommended usage is through Docker.


On a server powered by Intel GPUs, TGI can be launched with the following command:

```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --rm --privileged --cap-add=sys_nice \
    --device=/dev/dri \
    --ipc=host --shm-size 1g --net host -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.3.5-intel-xpu \
    --model-id $model --cuda-graphs 0
```

# Using TGI with Intel CPUs

Intel® Extension for PyTorch (IPEX) also provides further optimizations for Intel CPUs. The IPEX provides optimization operations such as flash attention, page attention, Add + LayerNorm, ROPE and more.

On a server powered by Intel CPU, TGI can be launched with the following command:

```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --rm --privileged --cap-add=sys_nice \
    --device=/dev/dri \
    --ipc=host --shm-size 1g --net host -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.3.5-intel-cpu \
    --model-id $model --cuda-graphs 0
```

The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.


================================================
FILE: docs/source/installation_nvidia.md
================================================
# Using TGI with Nvidia GPUs

TGI optimized models are supported on NVIDIA [H100](https://www.nvidia.com/en-us/data-center/h100/), [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it.

For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.

TGI can be used on NVIDIA GPUs through its official docker image:

```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.3.5 \
    --model-id $model
```

The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.


================================================
FILE: docs/source/installation_tpu.md
================================================
# Using TGI with Google TPUs

Check out this [guide](https://huggingface.co/docs/optimum-tpu) on how to serve models with TGI on TPUs.


================================================
FILE: docs/source/multi_backend_support.md
================================================
# Multi-backend support

TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs).
With multi-backend support, you can choose the backend that best suits your needs,
whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with
TGI remains consistent across backends, allowing you to switch between them seamlessly.

**Supported backends:**
* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option
  within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face.
* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference.
  It utilizes specialized optimizations and custom kernels for enhanced performance.
  However, it requires a model-specific compilation step for each GPU architecture.
* **[TGI Llamacpp backend](./backends/llamacpp)**: This backend facilitates the deployment of large language models
  (LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine optimized for both CPU and GPU computation.
* **[TGI Neuron backend](./backends/neuron)**: This backend leverages the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) to allow the deployment of large language models (LLMs) on [AWS Trainium and Inferentia chips](https://aws.amazon.com/ai/machine-learning/trainium/).


================================================
FILE: docs/source/quicktour.md
================================================
# Quick Tour

The easiest way of getting started is using the official Docker container. Install Docker following [their installation instructions](https://docs.docker.com/get-docker/).

## Launching TGI

Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI on an Nvidia GPU. Here is an example on how to do that:

```bash
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.3.5 \
    --model-id $model
```

<Tip>

If you want to serve gated or private models, please refer to
[this guide](https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/gated_model_access)
for detailed instructions.

</Tip>

### Supported hardware

TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Intel GPUs](./installation_intel), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.

## Consuming TGI

Once TGI is running, you can use the `generate` endpoint or the Open AI Chat Completion API compatible [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.

<inferencesnippet>
<python>

```python
import requests

headers = {
    "Content-Type": "application/json",
}

data = {
    'inputs': 'What is Deep Learning?',
    'parameters': {
        'max_new_tokens': 20,
    },
}

response = requests.post('http://127.0.0.1:8080/generate', headers=headers, json=data)
print(response.json())
# {'generated_text': '\n\nDeep Learning is a subset of Machine Learning that is concerned with the development of algorithms that can'}
```
</python>
<js>

```js
async function query() {
    const response = await fetch(
        'http://127.0.0.1:8080/generate',
        {
            method: 'POST',
            headers: { 'Content-Type': 'application/json'},
            body: JSON.stringify({
                'inputs': 'What is Deep Learning?',
                'parameters': {
                    'max_new_tokens': 20
                }
            })
        }
    );
}

query().then((response) => {
    console.log(JSON.stringify(response));
});
/// {"generated_text":"\n\nDeep Learning is a subset of Machine Learning that is concerned with the development of algorithms that can"}
```

</js>
<curl>

```curl
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```

</curl>
</inferencesnippet>

<Tip>

To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.

```bash
docker run ghcr.io/huggingface/text-generation-inference:3.3.5 --help
```

</Tip>


================================================
FILE: docs/source/reference/api_reference.md
================================================
# HTTP API Reference

#### Table of Contents

- [Text Generation Inference custom API](#text-generation-inference-custom-api)
- [OpenAI Messages API](#openai-messages-api)
  - [Making a Request](#making-a-request)
  - [Streaming](#streaming)
  - [Synchronous](#synchronous)
  - [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
  - [Cloud Providers](#cloud-providers)
      - [Amazon SageMaker](#amazon-sagemaker)

The HTTP API is a RESTful API that allows you to interact with the text-generation-inference component. Two endpoints are available:
* Text Generation Inference [custom API](https://huggingface.github.io/text-generation-inference/)
* OpenAI's [Messages API](#openai-messages-api)


## Text Generation Inference custom API

Check the [API documentation](https://huggingface.github.io/text-generation-inference/) for more information on how to interact with the Text Generation Inference API.

## OpenAI Messages API

Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.

> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.

## Making a Request

You can make a request to TGI's Messages API using `curl`. Here's an example:

```bash
curl localhost:3000/v1/chat/completions \
    -X POST \
    -d '{
  "model": "tgi",
  "messages": [
    {
      "role": "system",
      "content": "You are a helpful assistant."
    },
    {
      "role": "user",
      "content": "What is deep learning?"
    }
  ],
  "stream": true,
  "max_tokens": 20
}' \
    -H 'Content-Type: application/json'
```

## Streaming

You can also use OpenAI's Python client library to make a streaming request. Here's how:

```python
from openai import OpenAI

# init the client but point it to TGI
client = OpenAI(
    base_url="http://localhost:3000/v1",
    api_key="-"
)

chat_completion = client.chat.completions.create(
    model="tgi",
    messages=[
        {"role": "system", "content": "You are a helpful assistant." },
        {"role": "user", "content": "What is deep learning?"}
    ],
    stream=True
)

# iterate and print stream
for message in chat_completion:
    print(message)
```

## Synchronous

If you prefer to make a synchronous request, you can do so like this:

```python
from openai import OpenAI

# init the client but point it to TGI
client = OpenAI(
    base_url="http://localhost:3000/v1",
    api_key="-"
)

chat_completion = client.chat.completions.create(
    model="tgi",
    messages=[
        {"role": "system", "content": "You are a helpful assistant." },
        {"role": "user", "content": "What is deep learning?"}
    ],
    stream=False
)

print(chat_completion)
```

## Hugging Face Inference Endpoints

The Messages API is integrated with [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated).
Every endpoint that uses "Text Generation Inference" with an LLM, which has a chat template can now be used. Below is an example of how to use IE with TGI using OpenAI's Python client library:

> **Note:** Make sure to replace `base_url` with your endpoint URL and to include `v1/` at the end of the URL. The `api_key` should be replaced with your Hugging Face API key.

```python
from openai import OpenAI

# init the client but point it to TGI
client = OpenAI(
    # replace with your endpoint url, make sure to include "v1/" at the end
    base_url="https://vlzz10eq3fol3429.us-east-1.aws.endpoints.huggingface.cloud/v1/",
    # replace with your API key
    api_key="hf_XXX"
)

chat_completion = client.chat.completions.create(
    model="tgi",
    messages=[
        {"role": "system", "content": "You are a helpful assistant." },
        {"role": "user", "content": "What is deep learning?"}
    ],
    stream=True
)

# iterate and print stream
for message in chat_completion:
    print(message.choices[0].delta.content, end="")
```

## Cloud Providers

TGI can be deployed on various cloud providers for scalable and robust text generation. One such provider is Amazon SageMaker, which has recently added support for TGI. Here's how you can deploy TGI on Amazon SageMaker:

## Amazon SageMaker

Amazon Sagemaker natively supports the message API:

```python
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

try:
 role = sagemaker.get_execution_role()
except ValueError:
 iam = boto3.client('iam')
 role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Hub Model configuration. https://huggingface.co/models
hub = {
 'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta',
 'SM_NUM_GPUS': json.dumps(1),
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
 image_uri=get_huggingface_llm_image_uri("huggingface",version="3.3.5"),
 env=hub,
 role=role,
)

# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
 initial_instance_count=1,
 instance_type="ml.g5.2xlarge",
 container_startup_health_check_timeout=300,
  )

# send request
predictor.predict({
"messages": [
        {"role": "system", "content": "You are a helpful assistant." },
        {"role": "user", "content": "What is deep learning?"}
    ]
})
```


================================================
FILE: docs/source/reference/launcher.md
================================================
# Text-generation-launcher arguments

<!-- WRAP CODE BLOCKS -->

```shell
Text Generation Launcher

Usage: text-generation-launcher [OPTIONS]

Options:
```
## MODEL_ID
```shell
      --model-id <MODEL_ID>
          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
          
          [env: MODEL_ID=]
          [default: bigscience/bloom-560m]

```
## REVISION
```shell
      --revision <REVISION>
          The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
          
          [env: REVISION=]

```
## VALIDATION_WORKERS
```shell
      --validation-workers <VALIDATION_WORKERS>
          The number of tokenizer workers used for payload validation and truncation inside the router
          
          [env: VALIDATION_WORKERS=]
          [default: 2]

```
## SHARDED
```shell
      --sharded <SHARDED>
          Whether to shard the model across multiple GPUs By default text-generation-inference will use all available GPUs to run the model. Setting it to `false` deactivates `num_shard`
          
          [env: SHARDED=]
          [possible values: true, false]

```
## NUM_SHARD
```shell
      --num-shard <NUM_SHARD>
          The number of shards to use if you don't want to use all GPUs on a given machine. You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num_shard 2` and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num_shard 2` to launch 2 copies with 2 shard each on a given machine with 4 GPUs for instance
          
          [env: NUM_SHARD=]

```
## QUANTIZE
```shell
      --quantize <QUANTIZE>
          Quantization method to use for the model. It is not necessary to specify this option for pre-quantized models, since the quantization method is read from the model configuration.
          
          Marlin kernels will be used automatically for GPTQ/AWQ models.

          Possible values:
          - awq:                4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
          - compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
          - eetq:               8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
          - exl2:               Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
          - gptq:               4 bit quantization. Requires a specific GTPQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
          - marlin:             4 bit quantization. Requires a specific Marlin quantized model: <https://hf.co/models?search=marlin>
          - bitsandbytes:       Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
          - bitsandbytes-nf4:   Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
          - bitsandbytes-fp4:   Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for you model
          - fp8:                [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above This dtype has native ops should be the fastest if available. This is currently not the fastest because of local unpacking + padding to satisfy matrix multiplication limitations
          
          [env: QUANTIZE=]

```
## SPECULATE
```shell
      --speculate <SPECULATE>
          The number of input_ids to speculate on If using a medusa model, the heads will be picked up automatically Other wise, it will use n-gram speculation which is relatively free in terms of compute, but the speedup heavily depends on the task
          
          [env: SPECULATE=]

```
## DTYPE
```shell
      --dtype <DTYPE>
          The dtype to be forced upon the model. This option cannot be used with `--quantize`
          
          [env: DTYPE=]
          [possible values: float16, bfloat16]

```
## KV_CACHE_DTYPE
```shell
      --kv-cache-dtype <KV_CACHE_DTYPE>
          Specify the dtype for the key-value cache. When this option is not provided, the dtype of the model is used (typically `float16` or `bfloat16`). Currently the only supported value are `fp8_e4m3fn` and `fp8_e5m2` on CUDA
          
          [env: KV_CACHE_DTYPE=]
          [possible values: fp8_e4m3fn, fp8_e5m2]

```
## TRUST_REMOTE_CODE
```shell
      --trust-remote-code
          Whether you want to execute hub modelling code. Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision
          
          [env: TRUST_REMOTE_CODE=]

```
## MAX_CONCURRENT_REQUESTS
```shell
      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
          The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly
          
          [env: MAX_CONCURRENT_REQUESTS=]
          [default: 128]

```
## MAX_BEST_OF
```shell
      --max-best-of <MAX_BEST_OF>
          This is the maximum allowed value for clients to set `best_of`. Best of makes `n` generations at the same time, and return the best in terms of overall log probability over the entire generated sequence
          
          [env: MAX_BEST_OF=]
          [default: 2]

```
## MAX_STOP_SEQUENCES
```shell
      --max-stop-sequences <MAX_STOP_SEQUENCES>
          This is the maximum allowed value for clients to set `stop_sequences`. Stop sequences are used to allow the model to stop on more than just the EOS token, and enable more complex "prompting" where users can preprompt the model in a specific way and define their "own" stop token aligned with their prompt
          
          [env: MAX_STOP_SEQUENCES=]
          [default: 4]

```
## MAX_TOP_N_TOKENS
```shell
      --max-top-n-tokens <MAX_TOP_N_TOKENS>
          This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like for classification or ranking
          
          [env: MAX_TOP_N_TOKENS=]
          [default: 5]

```
## MAX_INPUT_TOKENS
```shell
      --max-input-tokens <MAX_INPUT_TOKENS>
          This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompt users can send which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence they can handle. Default to min(max_allocatable, max_position_embeddings) - 1
          
          [env: MAX_INPUT_TOKENS=]

```
## MAX_INPUT_LENGTH
```shell
      --max-input-length <MAX_INPUT_LENGTH>
          Legacy version of [`Args::max_input_tokens`]
          
          [env: MAX_INPUT_LENGTH=]

```
## MAX_TOTAL_TOKENS
```shell
      --max-total-tokens <MAX_TOTAL_TOKENS>
          This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. with a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be. Default to min(max_allocatable, max_position_embeddings)
          
          [env: MAX_TOTAL_TOKENS=]

```
## WAITING_SERVED_RATIO
```shell
      --waiting-served-ratio <WAITING_SERVED_RATIO>
          This represents the ratio of waiting queries vs running queries where you want to start considering pausing the running queries to include the waiting ones into the same batch. `waiting_served_ratio=1.2` Means when 12 queries are waiting and there's only 10 queries left in the current batch we check if we can fit those 12 waiting queries into the batching strategy, and if yes, then batching happens delaying the 10 running queries by a `prefill` run.
          
          This setting is only applied if there is room in the batch as defined by `max_batch_total_tokens`.
          
          [env: WAITING_SERVED_RATIO=]
          [default: 0.3]

```
## MAX_BATCH_PREFILL_TOKENS
```shell
      --max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
          Limits the number of tokens for the prefill operation. Since this operation take the most memory and is compute bound, it is interesting to limit the number of requests that can be sent. Default to `max_input_tokens + 50` to give a bit of room
          
          [env: MAX_BATCH_PREFILL_TOKENS=]

```
## MAX_BATCH_TOTAL_TOKENS
```shell
      --max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
          **IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
          
          This represents the total amount of potential tokens within a batch. When using padding (not recommended) this would be equivalent of `batch_size` * `max_total_tokens`.
          
          However in the non-padded (flash attention) version this can be much finer.
          
          For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
          
          Overall this number should be the largest possible amount that fits the remaining memory (after the model is loaded). Since the actual memory overhead depends on other parameters like if you're using quantization, flash attention or the model implementation, text-generation-inference infers this number automatically if not provided ensuring that the value is as large as possible.
          
          [env: MAX_BATCH_TOTAL_TOKENS=]

```
## MAX_WAITING_TOKENS
```shell
      --max-waiting-tokens <MAX_WAITING_TOKENS>
          This setting defines how many tokens can be passed before forcing the waiting queries to be put on the batch (if the size of the batch allows for it). New queries require 1 `prefill` forward, which is different from `decode` and therefore you need to pause the running batch in order to run `prefill` to create the correct values for the waiting queries to be able to join the batch.
          
          With a value too small, queries will always "steal" the compute to run `prefill` and running queries will be delayed by a lot.
          
          With a value too big, waiting queries could wait for a very long time before being allowed a slot in the running batch. If your server is busy that means that requests that could run in ~2s on an empty server could end up running in ~20s because the query had to wait for 18s.
          
          This number is expressed in number of tokens to make it a bit more "model" agnostic, but what should really matter is the overall latency for end users.
          
          [env: MAX_WAITING_TOKENS=]
          [default: 20]

```
## MAX_BATCH_SIZE
```shell
      --max-batch-size <MAX_BATCH_SIZE>
          Enforce a maximum number of requests per batch Specific flag for hardware targets that do not support unpadded inference
          
          [env: MAX_BATCH_SIZE=]

```
## CUDA_GRAPHS
```shell
      --cuda-graphs <CUDA_GRAPHS>
          Specify the batch sizes to compute cuda graphs for. Use "0" to disable. Default = "1,2,4,8,16,32"
          
          [env: CUDA_GRAPHS=]

```
## HOSTNAME
```shell
      --hostname <HOSTNAME>
          The IP address to listen on
          
          [env: HOSTNAME=]
          [default: 0.0.0.0]

```
## PORT
```shell
  -p, --port <PORT>
          The port to listen on
          
          [env: PORT=]
          [default: 3000]

```
## PROMETHEUS_PORT
```shell
  -p, --prometheus-port <PROMETHEUS_PORT>
          The Prometheus port to listen on
          
          [env: PROMETHEUS_PORT=]
          [default: 9000]

```
## SHARD_UDS_PATH
```shell
      --shard-uds-path <SHARD_UDS_PATH>
          The name of the socket for gRPC communication between the webserver and the shards
          
          [env: SHARD_UDS_PATH=]
          [default: /tmp/text-generation-server]

```
## MASTER_ADDR
```shell
      --master-addr <MASTER_ADDR>
          The address the master shard will listen on. (setting used by torch distributed)
          
          [env: MASTER_ADDR=]
          [default: localhost]

```
## MASTER_PORT
```shell
      --master-port <MASTER_PORT>
          The address the master port will listen on. (setting used by torch distributed)
          
          [env: MASTER_PORT=]
          [default: 29500]

```
## HUGGINGFACE_HUB_CACHE
```shell
      --huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
          
          [env: HUGGINGFACE_HUB_CACHE=]

```
## WEIGHTS_CACHE_OVERRIDE
```shell
      --weights-cache-override <WEIGHTS_CACHE_OVERRIDE>
          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
          
          [env: WEIGHTS_CACHE_OVERRIDE=]

```
## DISABLE_CUSTOM_KERNELS
```shell
      --disable-custom-kernels
          For some models (like bloom), text-generation-inference implemented custom cuda kernels to speed up inference. Those kernels were only tested on A100. Use this flag to disable them if you're running on different hardware and encounter issues
          
          [env: DISABLE_CUSTOM_KERNELS=]

```
## CUDA_MEMORY_FRACTION
```shell
      --cuda-memory-fraction <CUDA_MEMORY_FRACTION>
          Limit the CUDA available memory. The allowed value equals the total visible memory multiplied by cuda-memory-fraction
          
          [env: CUDA_MEMORY_FRACTION=]
          [default: 1.0]

```
## ROPE_SCALING
```shell
      --rope-scaling <ROPE_SCALING>
          Rope scaling will only be used for RoPE models and allow rescaling the position rotary to accomodate for larger prompts.
          
          Goes together with `rope_factor`.
          
          `--rope-factor 2.0` gives linear scaling with a factor of 2.0 `--rope-scaling dynamic` gives dynamic scaling with a factor of 1.0 `--rope-scaling linear` gives linear scaling with a factor of 1.0 (Nothing will be changed basically)
          
          `--rope-scaling linear --rope-factor` fully describes the scaling you want
          
          [env: ROPE_SCALING=]
          [possible values: linear, dynamic]

```
## ROPE_FACTOR
```shell
      --rope-factor <ROPE_FACTOR>
          Rope scaling will only be used for RoPE models See `rope_scaling`
          
          [env: ROPE_FACTOR=]

```
## JSON_OUTPUT
```shell
      --json-output
          Outputs the logs in JSON format (useful for telemetry)
          
          [env: JSON_OUTPUT=]

```
## OTLP_ENDPOINT
```shell
      --otlp-endpoint <OTLP_ENDPOINT>
          [env: OTLP_ENDPOINT=]

```
## OTLP_SERVICE_NAME
```shell
      --otlp-service-name <OTLP_SERVICE_NAME>
          [env: OTLP_SERVICE_NAME=]
          [default: text-generation-inference.router]

```
## CORS_ALLOW_ORIGIN
```shell
      --cors-allow-origin <CORS_ALLOW_ORIGIN>
          [env: CORS_ALLOW_ORIGIN=]

```
## API_KEY
```shell
      --api-key <API_KEY>
          [env: API_KEY=]

```
## WATERMARK_GAMMA
```shell
      --watermark-gamma <WATERMARK_GAMMA>
          [env: WATERMARK_GAMMA=]

```
## WATERMARK_DELTA
```shell
      --watermark-delta <WATERMARK_DELTA>
          [env: WATERMARK_DELTA=]

```
## NGROK
```shell
      --ngrok
          Enable ngrok tunneling
          
          [env: NGROK=]

```
## NGROK_AUTHTOKEN
```shell
      --ngrok-authtoken <NGROK_AUTHTOKEN>
          ngrok authentication token
          
          [env: NGROK_AUTHTOKEN=]

```
## NGROK_EDGE
```shell
      --ngrok-edge <NGROK_EDGE>
          ngrok edge
          
          [env: NGROK_EDGE=]

```
## TOKENIZER_CONFIG_PATH
```shell
      --tokenizer-config-path <TOKENIZER_CONFIG_PATH>
          The path to the tokenizer config file. This path is used to load the tokenizer configuration which may include a `chat_template`. If not provided, the default config will be used from the model hub
          
          [env: TOKENIZER_CONFIG_PATH=]

```
## DISABLE_GRAMMAR_SUPPORT
```shell
      --disable-grammar-support
          Disable outlines grammar constrained generation. This is a feature that allows you to generate text that follows a specific grammar
          
          [env: DISABLE_GRAMMAR_SUPPORT=]

```
## ENV
```shell
  -e, --env
          Display a lot of information about your runtime environment

```
## MAX_CLIENT_BATCH_SIZE
```shell
      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
          Control the maximum number of inputs that a client can send in a single request
          
          [env: MAX_CLIENT_BATCH_SIZE=]
          [default: 4]

```
## LORA_ADAPTERS
```shell
      --lora-adapters <LORA_ADAPTERS>
          Lora Adapters a list of adapter ids i.e. `repo/adapter1,repo/adapter2` to load during startup that will be available to callers via the `adapter_id` field in a request
          
          [env: LORA_ADAPTERS=]

```
## USAGE_STATS
```shell
      --usage-stats <USAGE_STATS>
          Control if anonymous usage stats are collected. Options are "on", "off" and "no-stack" Defaul is on

          Possible values:
          - on:       Default option, usage statistics are collected anonymously
          - off:      Disables all collection of usage statistics
          - no-stack: Doesn't send the error stack trace or error type, but allows sending a crash event
          
          [env: USAGE_STATS=]
          [default: on]

```
## PAYLOAD_LIMIT
```shell
      --payload-limit <PAYLOAD_LIMIT>
          Payload size limit in bytes
          
          Default is 2MB
          
          [env: PAYLOAD_LIMIT=]
          [default: 2000000]

```
## ENABLE_PREFILL_LOGPROBS
```shell
      --enable-prefill-logprobs
          Enables prefill logprobs
          
          Logprobs in the prompt are deactivated by default because they consume a large amount of VRAM (especially for long prompts). Using this flag reallows users to ask for them.
          
          [env: ENABLE_PREFILL_LOGPROBS=]

```
## GRACEFUL_TERMINATION_TIMEOUT
```shell
  -g, --graceful-termination-timeout <GRACEFUL_TERMINATION_TIMEOUT>
          Change timeout of graceful termination of the TGI server
          
          [env: GRACEFUL_TERMINATION_TIMEOUT=]
          [default: 90]

```
## HELP
```shell
  -h, --help
          Print help (see a summary with '-h')

```
## VERSION
```shell
  -V, --version
          Print version

```


================================================
FILE: docs/source/reference/metrics.md
================================================
# Metrics

TGI exposes multiple metrics that can be collected via the `/metrics` Prometheus endpoint.
These metrics can be used to monitor the performance of TGI, autoscale deployment and to help identify bottlenecks.

The following metrics are exposed:

| Metric Name                                | Description                                                                              | Type      | Unit    |
|--------------------------------------------|------------------------------------------------------------------------------------------|-----------|---------|
| `tgi_batch_current_max_tokens`             | Maximum tokens for the current batch                                                     | Gauge     | Count   |
| `tgi_batch_current_size`                   | Current batch size                                                                       | Gauge     | Count   |
| `tgi_batch_decode_duration`                | Time spent decoding a batch per method (prefill or decode)                               | Histogram | Seconds |
| `tgi_batch_filter_duration`                | Time spent filtering batches and sending generated tokens per method (prefill or decode) | Histogram | Seconds |
| `tgi_batch_forward_duration`               | Batch forward duration per method (prefill or decode)                                    | Histogram | Seconds |
| `tgi_batch_inference_count`                | Inference calls per method (prefill or decode)                                           | Counter   | Count   |
| `tgi_batch_inference_duration`             | Batch inference duration                                                                 | Histogram | Seconds |
| `tgi_batch_inference_success`              | Number of successful inference calls per method (prefill or decode)                      | Counter   | Count   |
| `tgi_batch_next_size`                      | Batch size of the next batch                                                             | Histogram | Count   |
| `tgi_queue_size`                           | Current queue size                                                                       | Gauge     | Count   |
| `tgi_request_count`                        | Total number of requests                                                                 | Counter   | Count   |
| `tgi_request_duration`                     | Total time spent processing the request (e2e latency)                                    | Histogram | Seconds |
| `tgi_request_generated_tokens`             | Generated tokens per request                                                             | Histogram | Count   |
| `tgi_request_inference_duration`           | Request inference duration                                                               | Histogram | Seconds |
| `tgi_request_input_length`                 | Input token length per request                                                           | Histogram | Count   |
| `tgi_request_max_new_tokens`               | Maximum new tokens per request                                                           | Histogram | Count   |
| `tgi_request_mean_time_per_token_duration` | Mean time per token per request (inter-token latency)                                    | Histogram | Seconds |
| `tgi_request_queue_duration`               | Time spent in the queue per request                                                      | Histogram | Seconds |
| `tgi_request_skipped_tokens`               | Speculated tokens per request                                                            | Histogram | Count   |
| `tgi_request_success`                      | Number of successful requests                                                            | Counter   |         |
| `tgi_request_validation_duration`          | Time spent validating the request                                                        | Histogram | Seconds |


================================================
FILE: docs/source/supported_models.md
================================================

# Supported Models

Text Generation Inference enables serving optimized models. The following sections list which models (VLMs & LLMs) are supported.

- [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2)
- [Deepseek V3](https://huggingface.co/deepseek-ai/DeepSeek-V3)
- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
- [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (Multimodal)
- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
- [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
- [Llama4](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
- [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
- [Granite](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct)
- [Gemma](https://huggingface.co/google/gemma-7b)
- [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224)
- [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
- [Gemma3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d)
- [Gemma3 Text](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d)
- [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
- [Dbrx](https://huggingface.co/databricks/dbrx-instruct)
- [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj)
- [Mistral](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)
- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)
- [Gpt Bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder)
- [Phi](https://huggingface.co/microsoft/phi-1_5)
- [PhiMoe](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct)
- [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)
- [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct)
- [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1)
- [Qwen 2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f)
- [Qwen 2 VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
- [Qwen 2.5 VL](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
- [Opt](https://huggingface.co/facebook/opt-6.7b)
- [T5](https://huggingface.co/google/flan-t5-xxl)
- [Galactica](https://huggingface.co/facebook/galactica-120b)
- [SantaCoder](https://huggingface.co/bigcode/santacoder)
- [Bloom](https://huggingface.co/bigscience/bloom-560m)
- [Mpt](https://huggingface.co/mosaicml/mpt-7b-instruct)
- [Gpt2](https://huggingface.co/openai-community/gpt2)
- [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
- [Gptj](https://huggingface.co/EleutherAI/gpt-j-6b)
- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal)
- [Mllama](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) (Multimodal)


If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:

```python
# for causal LMs/text-generation models
AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")
# or, for text-to-text generation models
AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")
```

If you wish to serve a supported model that already exists on a local folder, just point to the local folder.

```bash
text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
```


================================================
FILE: docs/source/usage_statistics.md
================================================

# Collection of Usage Statistics

Text Generation Inference collects anonymous usage statistics to help us improve the service. The collected data is used to improve TGI and to understand what causes failures. The data is collected transparently and any sensitive information is omitted.

Usage statistics are collected only when TGI is running in a Docker container. This prevents data collection when TGI is run directly on the host machine. The collected data includes startup and shutdown events, as well as a heartbeat signal sent every 15 minutes.

## What data is collected

The code that collects the data is available [here](https://github.com/huggingface/text-generation-inference/blob/main/router/src/usage_stats.rs).
As of release 2.1.2 this is an example of the data collected:

- From the TGI configuration:
```json
{
  "event_type": "start",
  "disable_grammar_support": false,
  "max_batch_prefill_tokens": 4096,
  "max_batch_size": null,
  "max_batch_total_tokens": null,
  "max_best_of": 2,
  "max_client_batch_size": 4,
  "max_concurrent_requests": 128,
  "max_input_tokens": 1024,
  "max_stop_sequences": 4,
  "max_top_n_tokens": 5,
  "max_total_tokens": 2048,
  "max_waiting_tokens": 20,
  "model_config": {
    "model_type": "Bloom"
  },
  "revision": null,
  "tokenizer_class": "BloomTokenizerFast",
  "validation_workers": 2,
  "waiting_served_ratio": 1.2,
  "docker_label": "latest",
  "git_sha": "cfc118704880453d29bcbe4fbbd91dda501cf5fe",
  "nvidia_env": {
    "name": "NVIDIA A10G",
    "pci_bus_id": "00000000:00:1E.0",
    "driver_version": "535.183.01",
    "pstate": "P8",
    "pcie_link_gen_max": "4",
    "pcie_link_gen_current": "1",
    "temperature_gpu": "31",
    "utilization_gpu": "0 %",
    "utilization_memory": "0 %",
    "memory_total": "23028 MiB",
    "memory_free": "22515 MiB",
    "memory_used": "0 MiB",
    "reset_status_reset_required": "No",
    "reset_status_drain_and_reset_recommended": "No",
    "compute_cap": "8.6",
    "ecc_errors_corrected_volatile_total": "0",
    "mig_mode_current": "[N/A]",
    "power_draw_instant": "10.86 W",
    "power_limit": "300.00 W"
  },
  "system_env": {
    "cpu_count": 16,
    "cpu_type": "AMD EPYC 7R32",
    "total_memory": 66681196544,
    "architecture": "x86_64",
    "platform": "linux-unix-x86_64"
  }
}

```

## How to opt-out

By passing the `--usage-stats` to the text-generation-launcher you can control how much usage statistics are being collected.
`--usage-stats=no-stack` will not emit the stack traces from errors and the error types, but will continue to send start and stop events
`--usage-stats=off` will completely disable everything


================================================
FILE: flake.nix
================================================
{
  inputs = {
    crate2nix = {
      url = "github:nix-community/crate2nix";
      inputs.nixpkgs.follows = "hf-nix/nixpkgs";
    };
    nix-filter.url = "github:numtide/nix-filter";
    hf-nix.url = "github:huggingface/hf-nix";
    nixpkgs.follows = "hf-nix/nixpkgs";
    flake-utils.url = "github:numtide/flake-utils";
    rust-overlay = {
      url = "github:oxalica/rust-overlay";
      inputs.nixpkgs.follows = "hf-nix/nixpkgs";
    };
  };
  outputs =
    {
      self,
      crate2nix,
      nix-filter,
      nixpkgs,
      flake-utils,
      rust-overlay,
      hf-nix,
    }:
    flake-utils.lib.eachDefaultSystem (
      system:
      let
        cargoNix = crate2nix.tools.${system}.appliedCargoNix {
          name = "tgi";
          src = ./.;
          additionalCargoNixArgs = [ "--all-features" ];
        };
        pkgs = import nixpkgs {
          inherit system;
          inherit (hf-nix.lib) config;
          overlays = [
            rust-overlay.overlays.default
            hf-nix.overlays.default
            (import nix/overlay.nix)
          ];
        };
        crateOverrides = import ./nix/crate-overrides.nix { inherit pkgs nix-filter; };
        benchmark = cargoNix.workspaceMembers.text-generation-benchmark.build.override {
          inherit crateOverrides;
        };
        launcher =
          let
            launcherUnwrapped = cargoNix.workspaceMembers.text-generation-launcher.build.override {
              inherit crateOverrides;
            };
            packagePath =
              with pkgs.python3.pkgs;
              makePythonPath [
                torch
              ];
          in
          pkgs.writeShellApplication {
            name = "text-generation-launcher";
            text = ''
              PYTHONPATH="${packagePath}" ${launcherUnwrapped}/bin/text-generation-launcher "$@"
            '';
          };

        router =
          let
            routerUnwrapped = cargoNix.workspaceMembers.text-generation-router-v3.build.override {
              inherit crateOverrides;
            };
            packagePath =
              with pkgs.python3.pkgs;
              makePythonPath [
                protobuf
                sentencepiece
                torch
                transformers
              ];
          in
          pkgs.writeShellApplication {
            name = "text-generation-router";
            text = ''
              PYTHONPATH="${packagePath}" ${routerUnwrapped}/bin/text-generation-router "$@"
            '';
          };
        server = pkgs.python3.pkgs.callPackage ./nix/server.nix { inherit nix-filter; };
        client = pkgs.python3.pkgs.callPackage ./nix/client.nix { };
      in
      {
        checks = {
          rust =
            with pkgs;
            rustPlatform.buildRustPackage {
              name = "rust-checks";
              src = ./.;
              cargoLock = {
                lockFile = ./Cargo.lock;
              };
              buildInputs = [ openssl.dev ];
              nativeBuildInputs = [
                clippy
                pkg-config
                protobuf
                python3
                rustfmt
              ];
              buildPhase = ''
                cargo check
              '';
              checkPhase = ''
                cargo fmt -- --check
                cargo test -j $NIX_BUILD_CORES
                cargo clippy
              '';
              installPhase = "touch $out";
            };
        };
        formatter = pkgs.nixfmt-rfc-style;
        devShells = with pkgs; rec {
          default = pure;

          pure = mkShell {
            buildInputs = [
              benchmark
              launcher
              router
              server
            ];
          };
          test = mkShell {
            buildInputs =
              [
                benchmark
                launcher
                router
                server
                client
                openssl.dev
                pkg-config
                cargo
                rustfmt
                clippy
              ]
              ++ (with python3.pkgs; [
                docker
                pytest
                pytest-asyncio
                syrupy
                pre-commit
                ruff
              ]);
          };

          impure = callPackage ./nix/impure-shell.nix { inherit server; };

          impureWithCuda = callPackage ./nix/impure-shell.nix {
            inherit server;
            withCuda = true;
          };

          impure-flash-attn-v1 = callPackage ./nix/impure-shell.nix {
            server = server.override { flash-attn = python3.pkgs.flash-attn-v1; };
          };
        };

        packages = rec {
          inherit server;

          default = pkgs.writeShellApplication {
            name = "text-generation-inference";
            runtimeInputs = [
              server
              router
            ];
            text = ''
              ${launcher}/bin/text-generation-launcher "$@"
            '';
          };

          # Use plain nixpkgs without overlays for dockerTools. dockerTools
          # uses a Python package for computing the layers from the transitive
          # closure. However, this needs a lot of rebuilds due to our overlay.

          dockerImage = nixpkgs.legacyPackages.${system}.callPackage nix/docker.nix {
            text-generation-inference = default;
          };

          dockerImageStreamed = nixpkgs.legacyPackages.${system}.callPackage nix/docker.nix {
            text-generation-inference = default;
            stream = true;
          };
        };
      }
    );
}


================================================
FILE: integration-tests/conftest.py
================================================
pytest_plugins = [
    "fixtures.neuron.service",
    "fixtures.neuron.export_models",
    "fixtures.gaudi.service",
]
# ruff: noqa: E402
from _pytest.fixtures import SubRequest
from huggingface_hub.inference._generated.types.chat_completion import (
    ChatCompletionStreamOutput,
    ChatCompletionOutput,
)
from openai.types.chat.chat_completion_chunk import (
    ChatCompletionChunk as OAIChatCompletionChunk,
)
from openai.types.completion import Completion as OAICompletion
import requests


class SessionTimeoutFix(requests.Session):
    def request(self, *args, **kwargs):
        timeout = kwargs.pop("timeout", 120)
        return super().request(*args, **kwargs, timeout=timeout)


requests.sessions.Session = SessionTimeoutFix

import warnings
import asyncio
import contextlib
import json
import math
import os
import random
import subprocess
import sys
import tempfile
import time
import docker
import pytest
import base64

from pathlib import Path
from typing import Dict, List, Optional
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
from syrupy.extensions.json import JSONSnapshotExtension
from text_generation import AsyncClient
from text_generation.types import (
    BestOfSequence,
    Message,
    ChatComplete,
    ChatCompletionChunk,
    ChatCompletionComplete,
    Completion,
    Details,
    Grammar,
    InputToken,
    Response,
    Token,
)

DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
HF_TOKEN = os.getenv("HF_TOKEN", None)
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")


def pytest_addoption(parser):
    parser.addoption(
        "--release", action="store_true", default=False, help="run release tests"
    )
    parser.addoption(
        "--neuron", action="store_true", default=False, help="run neuron tests"
    )
    parser.addoption(
        "--gaudi", action="store_true", default=False, help="run gaudi tests"
    )
    parser.addoption(
        "--gaudi-all-models",
        action="store_true",
        default=False,
        help="Run tests for all models instead of just the default subset",
    )


def pytest_configure(config):
    config.addinivalue_line("markers", "release: mark test as a release-only test")
    config.addinivalue_line("markers", "neuron: mark test as a neuron test")


def pytest_collection_modifyitems(config, items):
    selectors = []
    if not config.getoption("--release"):
        # --release not given in cli: skip release tests
        def skip_release(item):
            if "release" in item.keywords:
                item.add_marker(pytest.mark.skip(reason="need --release option to run"))

        selectors.append(skip_release)

    if config.getoption("--gaudi"):

        def skip_not_gaudi(item):
            if "gaudi" not in item.keywords:
                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))

        selectors.append(skip_not_gaudi)
    else:

        def skip_gaudi(item):
            if "gaudi" in item.keywords:
                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))

        selectors.append(skip_gaudi)

    if config.getoption("--neuron"):

        def skip_not_neuron(item):
            if "neuron" not in item.keywords:
                item.add_marker(
                    pytest.mark.skip(reason="incompatible with --neuron option")
                )

        selectors.append(skip_not_neuron)
    else:

        def skip_neuron(item):
            if "neuron" in item.keywords:
                item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))

        selectors.append(skip_neuron)

    for item in items:
        for selector in selectors:
            selector(item)


@pytest.fixture(autouse=True, scope="module")
def container_log(request: SubRequest):
    error_log = request.getfixturevalue("error_log")
    assert error_log is not None
    yield
    if request.session.testsfailed:
        error_log.seek(0)
        print(error_log.read(), file=sys.stderr)
    else:
        error_log.truncate(0)
        error_log.seek(0)


class ResponseComparator(JSONSnapshotExtension):
    rtol = 0.2
    ignore_logprob = False

    def _serialize(
        self,
        data,
    ):
        if (
            isinstance(data, Response)
            or isinstance(data, ChatComplete)
            or isinstance(data, ChatCompletionChunk)
            or isinstance(data, ChatCompletionComplete)
            or isinstance(data, Completion)
            or isinstance(data, OAIChatCompletionChunk)
            or isinstance(data, OAICompletion)
        ):
            data = data.model_dump()
        elif isinstance(data, ChatCompletionStreamOutput) or isinstance(
            data, ChatCompletionOutput
        ):
            data = dict(data)
        elif isinstance(data, List):
            data = [self._serialize(d) for d in data]
        elif isinstance(data, dict):
            return data
        else:
            raise RuntimeError(f"Unexpected data {type(data)} : {data}")
        return data

    def serialize(
        self,
        data,
        *,
        include=None,
        exclude=None,
        matcher=None,
    ):
        data = self._serialize(data)
        data = self._filter(
            data=data,
            depth=0,
            path=(),
            exclude=exclude,
            include=include,
            matcher=matcher,
        )
        data = json.dumps(data, indent=2, ensure_ascii=False, sort_keys=False) + "\n"
        return data

    def matches(
        self,
        *,
        serialized_data,
        snapshot_data,
    ) -> bool:
        def convert_data(data):
            data = json.loads(data)
            return _convert_data(data)

        def _convert_data(data):
            if isinstance(data, Dict):
                if "choices" in data:
                    data["choices"] = list(
                        sorted(data["choices"], key=lambda x: int(x["index"]))
                    )
                    choices = data["choices"]
                    if isinstance(choices, List) and len(choices) >= 1:
                        if "delta" in choices[0]:
                            return ChatCompletionChunk(**data)
                        if "text" in choices[0]:
                            return Completion(**data)
                    return ChatComplete(**data)
                else:
                    return Response(**data)
            if isinstance(data, List):
                return [_convert_data(d) for d in data]
            raise NotImplementedError(f"Data: {data}")

        def eq_token(token: Token, other: Token) -> bool:
            return (
                token.id == other.id
                and token.text == other.text
                and (
                    self.ignore_logprob
                    or (token.logprob == other.logprob and token.logprob is None)
                    or math.isclose(token.logprob, other.logprob, rel_tol=self.rtol)
                )
                and token.special == other.special
            )

        def eq_prefill_token(prefill_token: InputToken, other: InputToken) -> bool:
            try:
                return (
                    prefill_token.id == other.id
                    and prefill_token.text == other.text
                    and (
                        self.ignore_logprob
                        or math.isclose(
                            prefill_token.logprob,
                            other.logprob,
                            rel_tol=self.rtol,
                        )
                        if prefill_token.logprob is not None
                        else prefill_token.logprob == other.logprob
                    )
                )
            except TypeError:
                return False

        def eq_best_of(details: BestOfSequence, other: BestOfSequence) -> bool:
            return (
                details.finish_reason == other.finish_reason
                and details.generated_tokens == other.generated_tokens
                and details.seed == other.seed
                and len(details.prefill) == len(other.prefill)
                and all(
                    [
                        eq_prefill_token(d, o)
                        for d, o in zip(details.prefill, other.prefill)
                    ]
                )
                and len(details.tokens) == len(other.tokens)
                and all([eq_token(d, o) for d, o in zip(details.tokens, other.tokens)])
            )

        def eq_details(details: Details, other: Details) -> bool:
            return (
                details.finish_reason == other.finish_reason
                and details.generated_tokens == other.generated_tokens
                and details.seed == other.seed
                and len(details.prefill) == len(other.prefill)
                and all(
                    [
                        eq_prefill_token(d, o)
                        for d, o in zip(details.prefill, other.prefill)
                    ]
                )
                and len(details.tokens) == len(other.tokens)
                and all([eq_token(d, o) for d, o in zip(details.tokens, other.tokens)])
                and (
                    len(details.best_of_sequences)
                    if details.best_of_sequences is not None
                    else 0
                )
                == (
                    len(other.best_of_sequences)
                    if other.best_of_sequences is not None
                    else 0
                )
                and (
                    all(
                        [
                            eq_best_of(d, o)
                            for d, o in zip(
                                details.best_of_sequences, other.best_of_sequences
                            )
                        ]
                    )
                    if details.best_of_sequences is not None
                    else details.best_of_sequences == other.best_of_sequences
                )
            )

        def eq_completion(response: Completion, other: Completion) -> bool:
            return response.choices[0].text == other.choices[0].text

        def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
            return (
                response.choices[0].message.content == other.choices[0].message.content
            )

        def eq_chat_complete_chunk(
            response: ChatCompletionChunk, other: ChatCompletionChunk
        ) -> bool:
            if response.choices:
                if response.choices[0].delta.content is not None:
                    return (
                        response.choices[0].delta.content
                        == other.choices[0].delta.content
                    )
                elif response.choices[0].delta.tool_calls is not None:
                    return (
                        response.choices[0].delta.tool_calls
                        == other.choices[0].delta.tool_calls
                    )
                else:
                    raise RuntimeError(
                        f"Invalid empty chat chunk {response} vs {other}"
                    )
            elif response.usage is not None:
                return response.usage == other.usage
            else:
                raise RuntimeError(f"Invalid empty chat {response} vs {other}")

        def eq_response(response: Response, other: Response) -> bool:
            return response.generated_text == other.generated_text and eq_details(
                response.details, other.details
            )

        serialized_data = convert_data(serialized_data)
        snapshot_data = convert_data(snapshot_data)

        if not isinstance(serialized_data, List):
            serialized_data = [serialized_data]
        if not isinstance(snapshot_data, List):
            snapshot_data = [snapshot_data]

        if len(serialized_data) == 0:
            return len(snapshot_data) == len(serialized_data)

        if isinstance(serialized_data[0], Completion):
            return len(snapshot_data) == len(serialized_data) and all(
                [eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)]
            )

        if isinstance(serialized_data[0], ChatComplete):
            return len(snapshot_data) == len(serialized_data) and all(
                [eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]
            )

        if isinstance(serialized_data[0], ChatCompletionChunk):
            return len(snapshot_data) == len(serialized_data) and all(
                [
                    eq_chat_complete_chunk(r, o)
                    for r, o in zip(serialized_data, snapshot_data)
                ]
            )

        return len(snapshot_data) == len(serialized_data) and all(
            [eq_response(r, o) for r, o in zip(serialized_data, snapshot_data)]
        )


class GenerousResponseComparator(ResponseComparator):
    # Needed for GPTQ with exllama which has serious numerical fluctuations.
    rtol = 0.75


class IgnoreLogProbResponseComparator(ResponseComparator):
    ignore_logprob = True


class LauncherHandle:
    def __init__(self, port: int, error_log):
        with warnings.catch_warnings(action="ignore"):
            self.client = AsyncClient(f"http://localhost:{port}", timeout=30)
        self.error_log = error_log

    def _inner_health(self):
        raise NotImplementedError

    async def health(self, timeout: int = 60):
        assert timeout > 0
        for _ in range(timeout):
            if not self._inner_health():
                self.error_log.seek(0)
                print(self.error_log.read(), file=sys.stderr)
                raise RuntimeError("Launcher crashed")

            try:
                await self.client.generate("test")
                return
            except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
                time.sleep(1)
        self.error_log.seek(0)
        print(self.error_log.read(), file=sys.stderr)
        raise RuntimeError("Health check failed")


class ContainerLauncherHandle(LauncherHandle):
    def __init__(self, docker_client, container_name, port: int, error_log):
        super().__init__(port, error_log)
        self.docker_client = docker_client
        self.container_name = container_name

    def _inner_health(self) -> bool:
        container = self.docker_client.containers.get(self.container_name)
        return container.status in ["running", "created"]


class ProcessLauncherHandle(LauncherHandle):
    def __init__(self, process, port: int, error_log):
        super().__init__(port, error_log)
        self.process = process

    def _inner_health(self) -> bool:
        return self.process.poll() is None


@pytest.fixture
def response_snapshot(snapshot):
    return snapshot.use_extension(ResponseComparator)


@pytest.fixture
def generous_response_snapshot(snapshot):
    return snapshot.use_extension(GenerousResponseComparator)


@pytest.fixture
def ignore_logprob_response_snapshot(snapshot):
    return snapshot.use_extension(IgnoreLogProbResponseComparator)


@pytest.fixture(scope="session")
def error_log():
    with tempfile.TemporaryFile("w+") as tmp:
        yield tmp


@pytest.fixture(scope="session")
async def launcher(error_log):
    @contextlib.contextmanager
    def local_launcher(
        model_id: str,
        num_shard: Optional[int] = None,
        quantize: Optional[str] = None,
        trust_remote_code: bool = False,
        use_flash_attention: bool = True,
        disable_grammar_support: bool = False,
        dtype: Optional[str] = None,
        kv_cache_dtype: Optional[str] = None,
        revision: Optional[str] = None,
        max_input_length: Optional[int] = None,
        max_input_tokens: Optional[int] = None,
        max_batch_prefill_tokens: Optional[int] = None,
        max_total_tokens: Optional[int] = None,
        lora_adapters: Optional[List[str]] = None,
        cuda_graphs: Optional[List[int]] = None,
        attention: Optional[str] = None,
    ):
        port = random.randint(8000, 10_000)
        master_port = random.randint(10_000, 20_000)

        shard_uds_path = (
            f"/tmp/tgi-tests-{model_id.split('/')[-1]}-{num_shard}-{quantize}-server"
        )

        args = [
            "text-generation-launcher",
            "--model-id",
            model_id,
            "--port",
            str(port),
            "--master-port",
            str(master_port),
            "--shard-uds-path",
            shard_uds_path,
        ]

        env = os.environ

        if disable_grammar_support:
            args.append("--disable-grammar-support")
        if num_shard is not None:
            args.extend(["--num-shard", str(num_shard)])
        if quantize is not None:
            args.append("--quantize")
            args.append(quantize)
        if dtype is not None:
            args.append("--dtype")
            args.append(dtype)
        if kv_cache_dtype is not None:
            args.append("--kv-cache-dtype")
            args.append(kv_cache_dtype)
        if revision is not None:
            args.append("--revision")
            args.append(revision)
        if trust_remote_code:
            args.append("--trust-remote-code")
        if max_input_length:
            args.append("--max-input-length")
            args.append(str(max_input_length))
        if max_input_tokens:
            args.append("--max-input-tokens")
            args.append(str(max_input_tokens))
        if max_batch_prefill_tokens:
            args.append("--max-batch-prefill-tokens")
            args.append(str(max_batch_prefill_tokens))
        if max_total_tokens:
            args.append("--max-total-tokens")
            args.append(str(max_total_tokens))
        if lora_adapters:
            args.append("--lora-adapters")
            args.append(",".join(lora_adapters))
        if cuda_graphs:
            args.append("--cuda-graphs")
            args.append(",".join(map(str, cuda_graphs)))

        print(" ".join(args), file=sys.stderr)

        env["LOG_LEVEL"] = "info,text_generation_router=debug"
        env["PREFILL_CHUNKING"] = "1"

        if not use_flash_attention:
            env["USE_FLASH_ATTENTION"] = "false"
        if attention is not None:
            env["ATTENTION"] = attention

            # with tempfile.TemporaryFile("w+") as tmp:
            # We'll output stdout/stderr to a temporary file. Using a pipe
            # cause the process to block until stdout is read.
        with subprocess.Popen(
            args,
            stdout=error_log,
            stderr=subprocess.STDOUT,
            env=env,
        ) as process:
            yield ProcessLauncherHandle(process, port, error_log=error_log)

            process.terminate()
            process.wait(60)

        if not use_flash_attention:
            del env["USE_FLASH_ATTENTION"]

    @contextlib.contextmanager
    def docker_launcher(
        model_id: str,
        num_shard: Optional[int] = None,
        quantize: Optional[str] = None,
        trust_remote_code: bool = False,
        use_flash_attention: bool = True,
        disable_grammar_support: bool = False,
        dtype: Optional[str] = None,
        kv_cache_dtype: Optional[str] = None,
        revision: Optional[str] = None,
        max_input_length: Optional[int] = None,
        max_batch_prefill_tokens: Optional[int] = None,
        max_total_tokens: Optional[int] = None,
        lora_adapters: Optional[List[str]] = None,
        cuda_graphs: Optional[List[int]] = None,
        attention: Optional[str] = None,
    ):
        port = random.randint(8000, 10_000)

        args = ["--model-id", model_id, "--env"]

        if disable_grammar_support:
            args.append("--disable-grammar-support")
        if num_shard is not None:
            args.extend(["--num-shard", str(num_shard)])
        if quantize is not None:
            args.append("--quantize")
            args.append(quantize)
        if dtype is not None:
            args.append("--dtype")
            args.append(dtype)
        if kv_cache_dtype is not None:
            args.append("--kv-cache-dtype")
            args.append(kv_cache_dtype)
        if revision is not None:
            args.append("--revision")
            args.append(revision)
        if trust_remote_code:
            args.append("--trust-remote-code")
        if max_input_length:
            args.append("--max-input-length")
            args.append(str(max_input_length))
        if max_batch_prefill_tokens:
            args.append("--max-batch-prefill-tokens")
            args.append(str(max_batch_prefill_tokens))
        if max_total_tokens:
            args.append("--max-total-tokens")
            args.append(str(max_total_tokens))
        if lora_adapters:
            args.append("--lora-adapters")
            args.append(",".join(lora_adapters))
        if cuda_graphs:
            args.append("--cuda-graphs")
            args.append(",".join(map(str, cuda_graphs)))

        client = docker.from_env()

        container_name = f"tgi-tests-{model_id.split('/')[-1]}-{num_shard}-{quantize}"

        try:
            container = client.containers.get(container_name)
            container.stop()
            container.remove()
            container.wait()
        except NotFound:
            pass

        gpu_count = num_shard if num_shard is not None else 1

        env = {
            "LOG_LEVEL": "info,text_generation_router=debug",
            "PREFILL_CHUNKING": "1",
        }
        if not use_flash_attention:
            env["USE_FLASH_ATTENTION"] = "false"
        if attention is not None:
            env["ATTENTION"] = attention

        if HF_TOKEN is not None:
            env["HF_TOKEN"] = HF_TOKEN

        volumes = []
        if DOCKER_VOLUME:
            volumes = [f"{DOCKER_VOLUME}:/data"]

        if DOCKER_DEVICES:
            if DOCKER_DEVICES.lower() == "none":
                devices = []
            else:
                devices = DOCKER_DEVICES.strip().split(",")
            visible = os.getenv("ROCR_VISIBLE_DEVICES")
            if visible:
                env["ROCR_VISIBLE_DEVICES"] = visible
            device_requests = []
            if not devices:
                devices = None
            elif devices == ["nvidia.com/gpu=all"]:
                devices = None
                device_requests = [
                    docker.types.DeviceRequest(
                        driver="cdi",
                        # count=gpu_count,
                        device_ids=[f"nvidia.com/gpu={i}"],
                    )
                    for i in range(gpu_count)
                ]
        else:
            devices = None
            device_requests = [
                docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
            ]

        client.api.timeout = 1000
        container = client.containers.run(
            DOCKER_IMAGE,
            command=args,
            name=container_name,
            environment=env,
            auto_remove=False,
            detach=True,
            device_requests=device_requests,
            devices=devices,
            volumes=volumes,
            ports={"80/tcp": port},
            healthcheck={"timeout": int(180 * 1e9), "retries": 2},  # 60s
            shm_size="1G",
        )

        def pipe():
            for log in container.logs(stream=True):
                log = log.decode("utf-8")
                error_log.write(log)

        # Start looping to pipe the logs
        import threading

        t = threading.Thread(target=pipe, args=())
        t.start()

        try:
            yield ContainerLauncherHandle(
                client, container.name, port, error_log=error_log
            )

            if not use_flash_attention:
                del env["USE_FLASH_ATTENTION"]

            try:
                container.stop()
                container.wait()
            except NotFound:
                pass

        finally:
            try:
                container.remove()
            except Exception:
                pass

    if DOCKER_IMAGE is not None:
        return docker_launcher
    return local_launcher


@pytest.fixture(scope="module")
def generate_load():
    async def generate_load_inner(
        client: AsyncClient,
        prompt: str,
        max_new_tokens: int,
        n: int,
        seed: Optional[int] = None,
        grammar: Optional[Grammar] = None,
        stop_sequences: Optional[List[str]] = None,
    ) -> List[Response]:
        futures = [
            client.generate(
                prompt,
                max_new_tokens=max_new_tokens,
                decoder_input_details=True,
                seed=seed,
                grammar=grammar,
                stop_sequences=stop_sequences,
            )
            for _ in range(n)
        ]

        return await asyncio.gather(*futures)

    return generate_load_inner


@pytest.fixture(scope="module")
def generate_multi():
    async def generate_load_inner(
        client: AsyncClient,
        prompts: List[str],
        max_new_tokens: int,
        seed: Optional[int] = None,
    ) -> List[Response]:
        import numpy as np

        arange = np.arange(len(prompts))
        perm = np.random.permutation(arange)
        rperm = [-1] * len(perm)
        for i, p in enumerate(perm):
            rperm[p] = i

        shuffled_prompts = [prompts[p] for p in perm]
        futures = [
            client.chat(
                messages=[Message(role="user", content=prompt)],
                max_tokens=max_new_tokens,
                temperature=0,
                seed=seed,
            )
            for prompt in shuffled_prompts
        ]

        shuffled_responses = await asyncio.gather(*futures)
        responses = [shuffled_responses[p] for p in rperm]
        return responses

    return generate_load_inner


# TODO fix the server parsser to count inline image tokens correctly
@pytest.fixture
def chicken():
    path = Path(__file__).parent / "images" / "chicken_on_money.png"

    with open(path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


@pytest.fixture
def cow_beach():
    path = Path(__file__).parent / "images" / "cow_beach.png"

    with open(path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"


================================================
FILE: integration-tests/fixtures/gaudi/service.py
================================================
import asyncio
import contextlib
import os
import shlex
import subprocess
import sys
import threading
import time
from tempfile import TemporaryDirectory
from typing import List
import socket

import docker
import pytest
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
import logging
from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
import huggingface_hub

logging.basicConfig(
    level=logging.INFO,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
    stream=sys.stdout,
)
logger = logging.getLogger(__file__)

# Use the latest image from the local docker build
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
HF_TOKEN = huggingface_hub.get_token()

assert (
    HF_TOKEN is not None
), "HF_TOKEN is not set, please set it as some models are gated and thus the test will fail without it"

if DOCKER_VOLUME is None:
    logger.warning(
        "DOCKER_VOLUME is not set, this will lead to the tests redownloading the models on each run, consider setting it to speed up testing"
    )

LOG_LEVEL = os.getenv("LOG_LEVEL", "info")

BASE_ENV = {
    "HF_HUB_ENABLE_HF_TRANSFER": "1",
    "LOG_LEVEL": LOG_LEVEL,
    "HF_TOKEN": os.getenv("HF_TOKEN", None),
}


HABANA_RUN_ARGS = {
    "runtime": "habana",
    "ipc_mode": "host",
    "cap_add": ["sys_nice"],
}


def stream_container_logs(container, test_name):
    """Stream container logs in a separate thread."""
    try:
        for log in container.logs(stream=True, follow=True):
            print(
                f"[TGI Server Logs - {test_name}] {log.decode('utf-8')}",
                end="",
                file=sys.stderr,
                flush=True,
            )
    except Exception as e:
        logger.error(f"Error streaming container logs: {str(e)}")


class TestClient(AsyncInferenceClient):
    def __init__(self, service_name: str, base_url: str):
        super().__init__(model=base_url)
        self.service_name = service_name


class LauncherHandle:
    def __init__(self, service_name: str, port: int):
        self.client = TestClient(service_name, f"http://localhost:{port}")

    def _inner_health(self):
        raise NotImplementedError

    async def health(self, timeout: int = 60):
        assert timeout > 0
        start_time = time.time()
        logger.info(f"Starting health check with timeout of {timeout}s")

        for attempt in range(timeout):
            if not self._inner_health():
                logger.error("Launcher crashed during health check")
                raise RuntimeError("Launcher crashed")

            try:
                await self.client.text_generation("test", max_new_tokens=1)
                elapsed = time.time() - start_time
                logger.info(f"Health check passed after {elapsed:.1f}s")
                return
            except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e:
                if attempt == timeout - 1:
                    logger.error(f"Health check failed after {timeout}s: {str(e)}")
                    raise RuntimeError(f"Health check failed: {str(e)}")
                if attempt % 10 == 0 and attempt != 0:  # Only log every 10th attempt
                    logger.debug(
                        f"Connection attempt {attempt}/{timeout} failed: {str(e)}"
                    )
                time.sleep(1)
            except Exception as e:
                logger.error(f"Unexpected error during health check: {str(e)}")
                # Get full traceback for debugging
                import traceback

                logger.error(f"Full traceback:\n{traceback.format_exc()}")
                raise


class ContainerLauncherHandle(LauncherHandle):
    def __init__(self, docker_client, container_name, port: int):
        service_name = container_name  # Use container name as service name
        super(ContainerLauncherHandle, self).__init__(service_name, port)
        self.docker_client = docker_client
        self.container_name = container_name

    def _inner_health(self) -> bool:
        try:
            container = self.docker_client.containers.get(self.container_name)
            status = container.status
            if status not in ["running", "created"]:
                logger.warning(f"Container status is {status}")
                # Get container logs for debugging
                logs = container.logs().decode("utf-8")
                logger.debug(f"Container logs:\n{logs}")
            return status in ["running", "created"]
        except Exception as e:
            logger.error(f"Error checking container health: {str(e)}")
            return False


class ProcessLauncherHandle(LauncherHandle):
    def __init__(self, process, port: int):
        service_name = "process"  # Use generic name for process launcher
        super(ProcessLauncherHandle, self).__init__(service_name, port)
        self.process = process

    def _inner_health(self) -> bool:
        return self.process.poll() is None


@pytest.fixture(scope="module")
def data_volume():
    tmpdir = TemporaryDirectory()
    yield tmpdir.name
    try:
        # Cleanup the temporary directory using sudo as it contains root files created by the container
        subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"), check=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Error cleaning up temporary directory: {str(e)}")


@pytest.fixture(scope="module")
def gaudi_launcher():
    @contextlib.contextmanager
    def docker_launcher(
        model_id: str,
        test_name: str,
        tgi_args: List[str] = None,
        env_config: dict = None,
    ):
        logger.info(
            f"Starting docker launcher for model {model_id} and test {test_name}"
        )

        # Get a random available port
        def get_free_port():
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.bind(("", 0))
                s.listen(1)
                port = s.getsockname()[1]
            return port

        port = get_free_port()
        logger.debug(f"Using port {port}")

        client = docker.from_env()

        container_name = f"tgi-gaudi-test-{test_name.replace('/', '-')}"

        try:
            container = client.containers.get(container_name)
            logger.info(
                f"Stopping existing container {container_name} for test {test_name}"
            )
            container.stop()
            container.wait()
            container.remove()
            logger.info(f"Removed existing container {container_name}")
        except NotFound:
            pass
        except Exception as e:
            logger.error(f"Error handling existing container: {str(e)}")

        if tgi_args is None:
            tgi_args = []
        else:
            tgi_args = tgi_args.copy()

        env = BASE_ENV.copy()

        # Add model_id to env
        env["MODEL_ID"] = model_id

        # Add env config that is defined in the fixture parameter
        if env_config is not None:
            env.update(env_config.copy())

        volumes = []
        if DOCKER_VOLUME:
            volumes = [f"{DOCKER_VOLUME}:/data"]
        logger.debug(f"Using volume {volumes}")

        try:
            logger.debug(f"Using command {tgi_args}")
            logger.info(f"Creating container with name {container_name}")

            logger.debug(f"Using environment {env}")
            logger.debug(f"Using volumes {volumes}")
            logger.debug(f"HABANA_RUN_ARGS {HABANA_RUN_ARGS}")

            # Log equivalent docker run command for debugging, this is not actually executed
            container = client.containers.run(
                DOCKER_IMAGE,
                command=tgi_args,
                name=container_name,
                environment=env,
                detach=True,
                volumes=volumes,
                ports={"80/tcp": port},
                **HABANA_RUN_ARGS,
            )

            logger.info(f"Container {container_name} started successfully")

            # Start log streaming in a background thread
            log_thread = threading.Thread(
                target=stream_container_logs,
                args=(container, test_name),
                daemon=True,  # This ensures the thread will be killed when the main program exits
            )
            log_thread.start()

            # Add a small delay to allow container to initialize
            time.sleep(2)

            # Check container status after creation
            status = container.status
            logger.debug(f"Initial container status: {status}")
            if status not in ["running", "created"]:
                logs = container.logs().decode("utf-8")
                logger.error(f"Container failed to start properly. Logs:\n{logs}")

            yield ContainerLauncherHandle(client, container.name, port)

        except Exception as e:
            logger.error(f"Error starting container: {str(e)}")
            # Get full traceback for debugging
            import traceback

            logger.error(f"Full traceback:\n{traceback.format_exc()}")
            raise
        finally:
            try:
                container = client.containers.get(container_name)
                logger.info(f"Stopping container {container_name}")
                container.stop()
                container.wait()

                container_output = container.logs().decode("utf-8")
                print(container_output, file=sys.stderr)

                container.remove()
                logger.info(f"Container {container_name} removed successfully")
            except NotFound:
                pass
            except Exception as e:
                logger.warning(f"Error cleaning up container: {str(e)}")

    return docker_launcher


@pytest.fixture(scope="module")
def gaudi_generate_load():
    async def generate_load_inner(
        client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
    ) -> List[TextGenerationOutput]:
        try:
            futures = [
                client.text_generation(
                    prompt,
                    max_new_tokens=max_new_tokens,
                    details=True,
                    decoder_input_details=True,
                )
                for _ in range(n)
            ]
            return await asyncio.gather(*futures)
        except Exception as e:
            logger.error(f"Error generating load: {str(e)}")
            raise

    return generate_load_inner


================================================
FILE: integration-tests/fixtures/neuron/export_models.py
================================================
import copy
import logging
import sys
from tempfile import TemporaryDirectory

import huggingface_hub
import pytest
import docker
import hashlib
import os
import tempfile

from docker.errors import NotFound


TEST_ORGANIZATION = "optimum-internal-testing"
TEST_CACHE_REPO_ID = f"{TEST_ORGANIZATION}/neuron-testing-cache"
HF_TOKEN = huggingface_hub.get_token()


logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
    stream=sys.stdout,
)
logger = logging.getLogger(__file__)


# All model configurations below will be added to the neuron_model_config fixture
MODEL_CONFIGURATIONS = {
    "llama": {
        "model_id": "unsloth/Llama-3.2-1B-Instruct",
        "export_kwargs": {
            "batch_size": 4,
            "sequence_length": 2048,
            "num_cores": 2,
            "auto_cast_type": "fp16",
        },
    },
    "qwen2": {
        "model_id": "Qwen/Qwen2.5-0.5B",
        "export_kwargs": {
            "batch_size": 4,
            "sequence_length": 4096,
            "num_cores": 2,
            "auto_cast_type": "fp16",
        },
    },
    "qwen3": {
        "model_id": "Qwen/Qwen3-1.7B",
        "export_kwargs": {
            "batch_size": 4,
            "sequence_length": 4096,
            "num_cores": 2,
            "auto_cast_type": "bf16",
        },
    },
    "granite": {
        "model_id": "ibm-granite/granite-3.1-2b-instruct",
        "export_kwargs": {
            "batch_size": 4,
            "sequence_length": 4096,
            "num_cores": 2,
            "auto_cast_type": "bf16",
        },
    },
    "phi3": {
        "model_id": "microsoft/Phi-3-mini-4k-instruct",
        "export_kwargs": {
            "batch_size": 4,
            "sequence_length": 4096,
            "num_cores": 2,
            "auto_cast_type": "bf16",
        },
    },
}


def get_neuron_backend_hash():
    import subprocess

    res = subprocess.run(
        ["git", "rev-parse", "--show-toplevel"], capture_output=True, text=True
    )
    root_dir = res.stdout.split("\n")[0]

    def get_sha(path):
        res = subprocess.run(
            ["git", "ls-tree", "HEAD", f"{root_dir}/{path}"],
            capture_output=True,
            text=True,
        )
        # Output of the command is in the form '040000 tree|blob <SHA>\t<path>\n'
        sha = res.stdout.split("\t")[0].split(" ")[-1]
        return sha.encode()

    # We hash both the neuron backends directory and Dockerfile and create a smaller hash out of that
    m = hashlib.sha256()
    m.update(get_sha("backends/neuron"))
    m.update(get_sha("Dockerfile.neuron"))
    return m.hexdigest()[:10]


def get_neuron_model_name(config_name: str):
    return f"neuron-tgi-testing-{config_name}-{get_neuron_backend_hash()}"


def get_tgi_docker_image():
    docker_image = os.getenv("DOCKER_IMAGE", None)
    if docker_image is None:
        client = docker.from_env()
        images = client.images.list(filters={"reference": "text-generation-inference"})
        if not images:
            raise ValueError(
                "No text-generation-inference image found on this host to run tests."
            )
        docker_image = images[0].tags[0]
    return docker_image


def maybe_export_model(config_name, model_config):
    """Export a neuron model for the specified test configuration.

    If the neuron model has not already been compiled and pushed to the hub, it is
    exported by a custom image built on the fly from the base TGI image.
    This makes sure the exported model and image are aligned and avoids introducing
    neuron specific imports in the test suite.

    Args:
        config_name (`str`):
            Used to identify test configurations
        model_config (`str`):
            The model configuration for export (includes the original model id)
    """
    neuron_model_name = get_neuron_model_name(config_name)
    neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}"
    hub = huggingface_hub.HfApi()
    if hub.repo_exists(neuron_model_id):
        logger.info(
            f"Skipping model export for config {config_name} as {neuron_model_id} already exists"
        )
        return neuron_model_id

    client = docker.from_env()

    env = {"LOG_LEVEL": "info", "CUSTOM_CACHE_REPO": TEST_CACHE_REPO_ID}
    if HF_TOKEN is not None:
        env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
        env["HF_TOKEN"] = HF_TOKEN

    # Create a sub-image to export the model to workaround docker dind issues preventing
    # to share a volume from the container running tests
    model_id = model_config["model_id"]
    export_kwargs = model_config["export_kwargs"]
    base_image = get_tgi_docker_image()
    export_image = f"neuron-tgi-tests-{config_name}-export-img"
    logger.info(f"Building temporary image {export_image} from {base_image}")
    with tempfile.TemporaryDirectory() as context_dir:
        # Create entrypoint
        model_path = "/data/neuron_model"
        export_command = (
            f"optimum-cli export neuron -m {model_id} --task text-generation"
        )
        for kwarg, value in export_kwargs.items():
            export_command += f" --{kwarg} {str(value)}"
        export_command += f" {model_path}"
        entrypoint_content = f"""#!/bin/sh
        {export_command}
        huggingface-cli repo create --organization {TEST_ORGANIZATION} {neuron_model_name}
        huggingface-cli upload {TEST_ORGANIZATION}/{neuron_model_name} {model_path} --exclude *.bin *.safetensors
        optimum-cli neuron cache synchronize --repo_id {TEST_CACHE_REPO_ID}
        """
        with open(os.path.join(context_dir, "entrypoint.sh"), "wb") as f:
            f.write(entrypoint_content.encode("utf-8"))
            f.flush()
        # Create Dockerfile
        docker_content = f"""
        FROM {base_image}
        COPY entrypoint.sh /export-entrypoint.sh
        RUN chmod +x /export-entrypoint.sh
        ENTRYPOINT ["/export-entrypoint.sh"]
        """
        with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
            f.write(docker_content.encode("utf-8"))
            f.flush()
        image, logs = client.images.build(
            path=context_dir, dockerfile=f.name, tag=export_image
        )
        logger.info("Successfully built image %s", image.id)
        logger.debug("Build logs %s", logs)

    try:
        client.containers.run(
            export_image,
            environment=env,
            auto_remove=True,
            detach=False,
            devices=["/dev/neuron0"],
            shm_size="1G",
        )
        logger.info(f"Successfully exported model for config {config_name}")
    except Exception as e:
        logger.exception(f"An exception occurred while running container: {e}.")
        pass
    finally:
        # Cleanup the export image
        logger.info("Cleaning image %s", image.id)
        try:
            image.remove(force=True)
        except NotFound:
            pass
        except Exception as e:
            logger.error("Error while removing image %s, skipping", image.id)
            logger.exception(e)
    return neuron_model_id


def maybe_export_models():
    for config_name, model_config in MODEL_CONFIGURATIONS.items():
        maybe_export_model(config_name, model_config)


@pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())
def neuron_model_config(request):
    """Expose a pre-trained neuron model

    The fixture first makes sure the following model artifacts are present on the hub:
    - exported neuron model under optimum-internal-testing/neuron-testing-<name>-<version>,
    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
    If not, it will export the model and push it to the hub.

    It then fetches the model locally and return a dictionary containing:
    - a configuration name,
    - the original model id,
    - the export parameters,
    - the neuron model id,
    - the neuron model local path.

    For each exposed model, the local directory is maintained for the duration of the
    test session and cleaned up afterwards.
    The hub model artifacts are never cleaned up and persist accross sessions.
    They must be cleaned up manually when the optimum-neuron version changes.

    """
    config_name = request.param
    model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
    # Export the model first (only if needed)
    neuron_model_id = maybe_export_model(config_name, model_config)
    with TemporaryDirectory() as neuron_model_path:
        logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
        hub = huggingface_hub.HfApi()
        hub.snapshot_download(
            neuron_model_id, etag_timeout=30, local_dir=neuron_model_path
        )
        # Add dynamic parameters to the model configuration
        model_config["neuron_model_path"] = neuron_model_path
        model_config["neuron_model_id"] = neuron_model_id
        # Also add model configuration name to allow tests to adapt their expectations
        model_config["name"] = config_name
        # Yield instead of returning to keep a reference to the temporary directory.
        # It will go out of scope and be released only once all tests needing the fixture
        # have been completed.
        logger.info(f"{config_name} ready for testing ...")
        yield model_config
        logger.info(f"Done with {config_name}")


@pytest.fixture(scope="module")
def neuron_model_path(neuron_model_config):
    yield neuron_model_config["neuron_model_path"]


if __name__ == "__main__":
    maybe_export_models()


================================================
FILE: integration-tests/fixtures/neuron/service.py
================================================
import asyncio
import contextlib
import logging
import os
import random
import shutil
import sys
import tempfile
import time
from typing import List

import docker
import huggingface_hub
import pytest
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
from huggingface_hub import AsyncInferenceClient, TextGenerationOutput


OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
HF_TOKEN = huggingface_hub.get_token()


def get_tgi_docker_image():
    docker_image = os.getenv("DOCKER_IMAGE", None)
    if docker_image is None:
        client = docker.from_env()
        images = client.images.list(filters={"reference": "text-generation-inference"})
        if not images:
            raise ValueError(
                "No text-generation-inference image found on this host to run tests."
            )
        docker_image = images[0].tags[0]
    return docker_image


logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
    stream=sys.stdout,
)
logger = logging.getLogger(__file__)


class TestClient(AsyncInferenceClient):
    def __init__(self, service_name: str, base_url: str):
        super().__init__(model=base_url)
        self.service_name = service_name


class LauncherHandle:
    def __init__(self, service_name: str, port: int):
        self.client = TestClient(service_name, f"http://localhost:{port}")

    def _inner_health(self):
        raise NotImplementedError

    async def health(self, timeout: int = 60):
        assert timeout > 0
        for i in range(timeout):
            if not self._inner_health():
                raise RuntimeError(f"Service crashed after {i} seconds.")

            try:
                await self.client.text_generation("test", max_new_tokens=1)
                logger.info(f"Service started after {i} seconds")
                return
            except (ClientConnectorError, ClientOSError, ServerDisconnectedError):
                time.sleep(1)
            except Exception:
                raise RuntimeError("Basic generation failed with: {e}")
        raise RuntimeError(f"Service failed to start after {i} seconds.")


class ContainerLauncherHandle(LauncherHandle):
    def __init__(self, service_name, docker_client, container_name, port: int):
        super(ContainerLauncherHandle, self).__init__(service_name, port)
        self.docker_client = docker_client
        self.container_name = container_name
        self._log_since = time.time()

    def _inner_health(self) -> bool:
        container = self.docker_client.containers.get(self.container_name)
        container_output = container.logs(since=self._log_since).decode("utf-8")
        self._log_since = time.time()
        if container_output != "":
            print(container_output, end="")
        return container.status in ["running", "created"]


@pytest.fixture(scope="module")
def event_loop():
    loop = asyncio.get_event_loop()
    yield loop
    loop.close()


@pytest.fixture(scope="module")
def neuron_launcher(event_loop):
    """Utility fixture to expose a TGI service.

    The fixture uses a single event loop for each module, but it can create multiple
    docker services with different parameters using the parametrized inner context.

    Args:
        service_name (`str`):
            Used to identify test configurations and adjust test expectations,
        model_name_or_path (`str`):
            The model to use (can be a hub model or a path)
        trust_remote_code (`bool`):
            Must be set to True for gated models.

    Returns:
        A `ContainerLauncherHandle` containing both a TGI server and client.
    """

    @contextlib.contextmanager
    def docker_launcher(
        service_name: str,
        model_name_or_path: str,
        trust_remote_code: bool = False,
    ):
        port = random.randint(8000, 10_000)

        client = docker.from_env()

        container_name = f"tgi-tests-{service_name}-{port}"

        try:
            container = client.containers.get(container_name)
            container.stop()
            container.wait()
        except NotFound:
            pass

        env = {
            "LOG_LEVEL": "info,text_generation_router=debug",
            "CUSTOM_CACHE_REPO": OPTIMUM_CACHE_REPO_ID,
        }

        if HF_TOKEN is not None:
            env["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
            env["HF_TOKEN"] = HF_TOKEN

        for var in [
            "MAX_BATCH_SIZE",
            "MAX_TOTAL_TOKENS",
            "HF_AUTO_CAST_TYPE",
            "HF_NUM_CORES",
        ]:
            if var in os.environ:
                env[var] = os.environ[var]

        base_image = get_tgi_docker_image()
        if os.path.isdir(model_name_or_path):
            # Create a sub-image containing the model to workaround docker dind issues preventing
            # to share a volume from the container running tests

            test_image = f"{container_name}-img"
            logger.info(
                "Building image on the flight derivated from %s, tagged with %s",
                base_image,
                test_image,
            )
            with tempfile.TemporaryDirectory() as context_dir:
                # Copy model directory to build context
                model_path = os.path.join(context_dir, "model")
                shutil.copytree(model_name_or_path, model_path)
                # Create Dockerfile
                container_model_id = f"/data/{model_name_or_path}"
                docker_content = f"""
                FROM {base_image}
                COPY model {container_model_id}
                """
                with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
                    f.write(docker_content.encode("utf-8"))
                    f.flush()
                image, logs = client.images.build(
                    path=context_dir, dockerfile=f.name, tag=test_image
                )
            logger.info("Successfully built image %s", image.id)
            logger.debug("Build logs %s", logs)
        else:
            test_image = base_image
            image = None
            container_model_id = model_name_or_path

        args = ["--model-id", container_model_id, "--env"]

        if trust_remote_code:
            args.append("--trust-remote-code")

        container = client.containers.run(
            test_image,
            command=args,
            name=container_name,
            environment=env,
            auto_remove=False,
            detach=True,
            devices=["/dev/neuron0"],
            ports={"80/tcp": port},
            shm_size="1G",
        )

        logger.info(f"Starting {container_name} container")
        yield ContainerLauncherHandle(service_name, client, container.name, port)

        try:
            container.stop(timeout=60)
            container.wait(timeout=60)
        except Exception as e:
            logger.exception(f"Ignoring exception while stopping container: {e}.")
            pass
        finally:
            logger.info("Removing container %s", container_name)
            try:
                container.remove(force=True)
            except Exception as e:
                logger.error(
                    "Error while removing container %s, skipping", container_name
                )
                logger.exception(e)

            # Cleanup the build image
            if image:
                logger.info("Cleaning image %s", image.id)
                try:
                    image.remove(force=True)
                except NotFound:
                    pass
                except Exception as e:
                    logger.error("Error while removing image %s, skipping", image.id)
                    logger.exception(e)

    return docker_launcher


@pytest.fixture(scope="module")
def neuron_generate_load():
    """A utility fixture to launch multiple asynchronous TGI requests in parallel

    Args:
        client (`AsyncClient`):
            An async client
        prompt (`str`):
            The prompt to use (identical for all requests)
        max_new_tokens (`int`):
            The number of tokens to generate for each request.
        n (`int`):
            The number of requests

    Returns:
        A list of `huggingface_hub.TextGenerationOutput`.
    """

    async def generate_load_inner(
        client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
    ) -> List[TextGenerationOutput]:
        futures = [
            client.text_generation(
                prompt,
                max_new_tokens=max_new_tokens,
                details=True,
                decoder_input_details=True,
            )
            for _ in range(n)
        ]

        return await asyncio.gather(*futures)

    return generate_load_inner


================================================
FILE: integration-tests/gaudi/capture_expected_outputs.py
================================================
import json
import os
from typing import Dict, Any, Generator

import pytest
from test_gaudi_generate import TEST_CONFIGS

UNKNOWN_CONFIGS = {
    name: config
    for name, config in TEST_CONFIGS.items()
    if config["expected_greedy_output"] == "unknown"
    or config["expected_batch_output"] == "unknown"
}


@pytest.fixture(scope="module", params=UNKNOWN_CONFIGS.keys())
def test_config(request) -> Dict[str, Any]:
    """Fixture that provides model configurations for testing."""
    test_config = UNKNOWN_CONFIGS[request.param]
    test_config["test_name"] = request.param
    return test_config


@pytest.fixture(scope="module")
def test_name(test_config):
    yield test_config["test_name"]


@pytest.fixture(scope="module")
def tgi_service(launcher, test_config, test_name) -> Generator:
    """Fixture that provides a TGI service for testing."""
    with launcher(test_config["model_id"], test_name) as service:
        yield service


@pytest.mark.asyncio
async def test_capture_expected_outputs(tgi_service, test_config, test_name):
    """Test that captures expected outputs for models with unknown outputs."""
    print(f"Testing {test_name} with {test_config['model_id']}")

    # Wait for service to be ready
    await tgi_service.health(1000)
    client = tgi_service.client

    # Test single request (greedy)
    print("Testing single request...")
    response = await client.generate(
        test_config["input"],
        max_new_tokens=32,
    )
    greedy_output = response.generated_text

    # Test multiple requests (batch)
    print("Testing batch requests...")
    responses = []
    for _ in range(4):
        response = await client.generate(
            test_config["input"],
            max_new_tokens=32,
        )
        responses.append(response.generated_text)

    # Store results in a JSON file
    output_file = "server/integration-tests/expected_outputs.json"
    results = {}

    # Try to load existing results if file exists
    if os.path.exists(output_file):
        with open(output_file, "r") as f:
            results = json.load(f)

    # Update results for this model
    results[test_name] = {
        "model_id": test_config["model_id"],
        "input": test_config["input"],
        "greedy_output": greedy_output,
        "batch_outputs": responses,
        "args": test_config["args"],
    }

    # Save updated results
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)

    print(f"\nResults for {test_name} saved to {output_file}")


================================================
FILE: integration-tests/gaudi/test_gaudi_generate.py
================================================
from typing import Any, Dict, Generator
from _pytest.fixtures import SubRequest
from huggingface_hub import AsyncInferenceClient
import pytest


def pytest_configure(config):
    config.addinivalue_line(
        "markers", "gaudi_all_models: mark test to run with all models"
    )


# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
TEST_CONFIGS = {
    "meta-llama/Llama-3.1-8B-Instruct-sharded": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "args": [
            "--sharded",
            "true",
            "--num-shard",
            "2",
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
        "run_by_default": True,
    },
    "meta-llama/Llama-3.1-8B-Instruct": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "env_config": {},
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
        "run_by_default": True,
    },
    "meta-llama/Llama-2-7b-chat-hf": {
        "model_id": "meta-llama/Llama-2-7b-chat-hf",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "mistralai/Mistral-7B-Instruct-v0.3": {
        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "bigcode/starcoder2-3b": {
        "model_id": "bigcode/starcoder2-3b",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "google/gemma-7b-it": {
        "model_id": "google/gemma-7b-it",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Deep learning is a powerful tool for many tasks,",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Deep learning is a powerful tool for many tasks,",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "Qwen/Qwen2-0.5B-Instruct": {
        "model_id": "Qwen/Qwen2-0.5B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "tiiuae/falcon-7b-instruct": {
        "model_id": "tiiuae/falcon-7b-instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "microsoft/phi-1_5": {
        "model_id": "microsoft/phi-1_5",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "openai-community/gpt2": {
        "model_id": "openai-community/gpt2",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on artificial neural networks. It is a type of machine learning that is based on the idea of",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on artificial neural networks. It is a type of machine learning that is based on the idea of",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "EleutherAI/gpt-j-6b": {
        "model_id": "EleutherAI/gpt-j-6b",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
}


def pytest_generate_tests(metafunc):
    if "test_config" in metafunc.fixturenames:
        if metafunc.config.getoption("--gaudi-all-models"):
            models = list(TEST_CONFIGS.keys())
        else:
            models = [
                name
                for name, config in TEST_CONFIGS.items()
                if config.get("run_by_default", False)
            ]
        print(f"Testing {len(models)} models")
        metafunc.parametrize("test_config", models, indirect=True)


@pytest.fixture(scope="module")
def test_config(request: SubRequest) -> Dict[str, Any]:
    """Fixture that provides model configurations for testing."""
    model_name = request.param
    test_config = TEST_CONFIGS[model_name]
    test_config["test_name"] = model_name
    return test_config


@pytest.fixture(scope="module")
def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]:
    yield test_config["model_id"]


@pytest.fixture(scope="module")
def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]:
    yield test_config["test_name"]


@pytest.fixture(scope="module")
def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, str]:
    return {
        "greedy": test_config["expected_greedy_output"],
        "batch": test_config["expected_batch_output"],
    }


@pytest.fixture(scope="module")
def input(test_config: Dict[str, Any]) -> str:
    return test_config["input"]


@pytest.fixture(scope="module")
def tgi_service(
    gaudi_launcher, model_id: str, test_name: str, test_config: Dict[str, Any]
):
    with gaudi_launcher(
        model_id,
        test_name,
        tgi_args=test_config.get("args", []),
        env_config=test_config.get("env_config", {}),
    ) as tgi_service:
        yield tgi_service


@pytest.fixture(scope="module")
async def tgi_client(tgi_service) -> AsyncInferenceClient:
    await tgi_service.health(1000)
    return tgi_service.client


@pytest.mark.asyncio
@pytest.mark.all_models
async def test_model_single_request(
    tgi_client: AsyncInferenceClient, expected_outputs: Dict[str, str], input: str
):
    # Bounded greedy decoding without input
    response = await tgi_client.text_generation(
        input,
        max_new_tokens=32,
        details=True,
        decoder_input_details=True,
    )
    assert response.details.generated_tokens == 32
    assert response.generated_text == expected_outputs["greedy"]


@pytest.mark.asyncio
@pytest.mark.all_models
async def test_model_multiple_requests(
    tgi_client: AsyncInferenceClient,
    gaudi_generate_load,
    expected_outputs: Dict[str, str],
    input: str,
):
    num_requests = 4
    responses = await gaudi_generate_load(
        tgi_client,
        input,
        max_new_tokens=32,
        n=num_requests,
    )

    assert len(responses) == 4
    expected = expected_outputs["batch"]
    for r in responses:
        assert r.details.generated_tokens == 32
        assert r.generated_text == expected


================================================
FILE: integration-tests/models/__snapshots__/test.py
================================================
import os
import json


for root, dirs, files in os.walk("."):
    for filename in files:
        if filename.endswith(".json"):
            with open(os.path.join(root, filename), "r") as f:
                data = json.load(f)

            print(os.path.join(root, filename))
            try:
                if filename.endswith("_load.json"):
                    for i in range(len(data)):
                        data[i]["details"]["prefill"] = []
                else:
                    data["details"]["prefill"] = []
            except Exception:
                pass

            with open(os.path.join(root, filename), "w") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)


================================================
FILE: integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 17934,
        "logprob": null,
        "text": "Pour"
      },
      {
        "id": 49833,
        "logprob": -10.5703125,
        "text": " dég"
      },
      {
        "id": 21543,
        "logprob": -0.14746094,
        "text": "uster"
      },
      {
        "id": 447,
        "logprob": -1.9277344,
        "text": " un"
      },
      {
        "id": 46341,
        "logprob": -15.421875,
        "text": " ort"
      },
      {
        "id": 35567,
        "logprob": -7.5820312,
        "text": "olan"
      },
      {
        "id": 15,
        "logprob": -1.4013672,
        "text": ","
      },
      {
        "id": 1669,
        "logprob": -1.5595703,
        "text": " il"
      },
      {
        "id": 11580,
        "logprob": -0.9428711,
        "text": " faut"
      },
      {
        "id": 3913,
        "logprob": -3.703125,
        "text": " tout"
      },
      {
        "id": 39261,
        "logprob": -1.7763672,
        "text": " d'abord"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 578,
        "logprob": -1.7822266,
        "special": false,
        "text": " le"
      },
      {
        "id": 5608,
        "logprob": -2.4882812,
        "special": false,
        "text": " faire"
      },
      {
        "id": 7735,
        "logprob": -2.4199219,
        "special": false,
        "text": " fond"
      },
      {
        "id": 289,
        "logprob": 0.0,
        "special": false,
        "text": "re"
      },
      {
        "id": 693,
        "logprob": -2.4628906,
        "special": false,
        "text": " à"
      },
      {
        "id": 366,
        "logprob": -1.1308594,
        "special": false,
        "text": " la"
      },
      {
        "id": 48844,
        "logprob": -1.7900391,
        "special": false,
        "text": " cass"
      },
      {
        "id": 1744,
        "logprob": 0.0,
        "special": false,
        "text": "ero"
      },
      {
        "id": 327,
        "logprob": 0.0,
        "special": false,
        "text": "le"
      },
      {
        "id": 2940,
        "logprob": -1.9306641,
        "special": false,
        "text": " avec"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " le faire fondre à la casserole avec"
}


================================================
FILE: integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 15,
        "logprob": null,
        "text": ","
      },
      {
        "id": 1669,
        "logprob": -5.4453125,
        "text": " il"
      },
      {
        "id": 11580,
        "logprob": -2.3378906,
        "text": " faut"
      },
      {
        "id": 3913,
        "logprob": -4.3320312,
        "text": " tout"
      },
      {
        "id": 39261,
        "logprob": -2.9160156,
        "text": " d'abord"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 408,
        "logprob": -0.16687012,
        "special": false,
        "text": " que"
      },
      {
        "id": 366,
        "logprob": -1.5517578,
        "special": false,
        "text": " la"
      },
      {
        "id": 8769,
        "logprob": -0.16687012,
        "special": false,
        "text": " personne"
      },
      {
        "id": 1479,
        "logprob": -2.1035156,
        "special": false,
        "text": " qui"
      },
      {
        "id": 143926,
        "logprob": -2.8671875,
        "special": false,
        "text": " réalise"
      },
      {
        "id": 578,
        "logprob": 0.0,
        "special": false,
        "text": " le"
      },
      {
        "id": 8138,
        "logprob": -0.66748047,
        "special": false,
        "text": " projet"
      },
      {
        "id": 795,
        "logprob": -1.6279297,
        "special": false,
        "text": " ne"
      },
      {
        "id": 9802,
        "logprob": -0.47875977,
        "special": false,
        "text": " soit"
      },
      {
        "id": 1230,
        "logprob": 0.0,
        "special": false,
        "text": " pas"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Pour déguster un ortolan, il faut tout d'abord que la personne qui réalise le projet ne soit pas"
}


================================================
FILE: integration-tests/models/__snapshots__/test_bloom_560m/test_bloom_560m_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 17934,
          "logprob": null,
          "text": "Pour"
        },
        {
          "id": 49833,
          "logprob": -10.5625,
          "text": " dég"
        },
        {
          "id": 21543,
          "logprob": -0.14770508,
          "text": "uster"
        },
        {
          "id": 447,
          "logprob": -1.9287109,
          "text": " un"
        },
        {
          "id": 46341,
          "logprob": -15.4609375,
          "text": " ort"
        },
        {
          "id": 35567,
          "logprob": -7.5585938,
          "text": "olan"
        },
        {
          "id": 15,
          "logprob": -1.4003906,
          "text": ","
        },
        {
          "id": 1669,
          "logprob": -1.5673828,
          "text": " il"
        },
        {
          "id": 11580,
          "logprob": -0.94628906,
          "text": " faut"
        },
        {
          "id": 3913,
          "logprob": -3.703125,
          "text": " tout"
        },
        {
          "id": 39261,
          "logprob": -1.5732422,
          "text": " d'abord"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 578,
          "logprob": -1.7646484,
          "special": false,
          "text": " le"
        },
        {
          "id": 5608,
          "logprob": -2.6113281,
          "special": false,
          "text": " faire"
        },
        {
          "id": 1767,
          "logprob": -1.5263672,
          "special": false,
          "text": " cu"
        },
        {
          "id": 1273,
          "logprob": -0.00010049343,
          "special": false,
          "text": "ire"
        },
        {
          "id": 1486,
          "logprob": -1.4707031,
          "special": false,
          "text": " dans"
        },
        {
          "id": 283,
          "logprob": -1.2119141,
          "special": false,
          "text": " de"
        },
        {
          "id": 40410,
          "logprob": -0.11883545,
          "special": false,
          "text": " l'eau"
        },
        {
          "id": 20226,
          "logprob": -0.40844727,
          "special": false,
          "text": " bou"
        },
        {
          "id": 172483,
          "logprob": -0.0037841797,
          "special": false,
          "text": "illante"
        },
        {
          "id": 2805,
          "logprob": -1.0195312,
          "special": false,
          "text": " sal"
        }
      ]
    },
    "generated_text": " le faire cuire dans de l'eau bouillante sal"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 17934,
          "logprob": null,
          "text": "Pour"
        },
        {
          "id": 49833,
          "logprob": -10.53125,
          "text": " dég"
        },
        {
          "id": 21543,
          "logprob": -0.14770508,
          "text": "uster"
        },
        {
          "id": 447,
          "logprob": -1.9287109,
          "text": " un"
        },
        {
          "id": 46341,
          "logprob": -15.4140625,
          "text": " ort"
        },
        {
          "id": 35567,
          "logprob": -7.5234375,
          "text": "olan"
        },
        {
          "id": 15,
          "logprob": -1.3613281,
          "text": ","
        },
        {
          "id": 1669,
          "logprob": -1.5458984,
          "text": " il"
        },
        {
          "id": 11580,
          "logprob": -0.94189453,
          "text": " faut"
        },
        {
          "id": 3913,
          "logprob": -3.7011719,
          "text": " tout"
        },
        {
          "id": 39261,
          "logprob": -1.5732422,
          "text": " d'abord"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 578,
          "logprob": -1.7548828,
          "special": false,
          "text": " le"
        },
        {
          "id": 5608,
          "logprob": -2.578125,
          "special": false,
          "text": " faire"
        },
        {
          "id": 1767,
          "logprob": -1.5117188,
          "special": false,
          "text": " cu"
        },
        {
          "id": 1273,
          "logprob": -0.00010049343,
          "special": false,
          "text": "ire"
        },
        {
          "id": 1486,
          "logprob": -1.4707031,
          "special": false,
          "text": " dans"
        },
        {
          "id": 283,
          "logprob": -1.1982422,
          "special": false,
          "text": " de"
        },
        {
          "id": 40410,
          "logprob": -0.11004639,
          "special": false,
          "text": " l'eau"
        },
        {
          "id": 20226,
          "logprob": -0.4506836,
          "special": false,
          "text": " bou"
        },
        {
          "id": 172483,
          "logprob": -0.003047943,
          "special": false,
          "text": "illante"
        },
        {
          "id": 2805,
          "logprob": -1.0185547,
          "special": false,
          "text": " sal"
        }
      ]
    },
    "generated_text": " le faire cuire dans de l'eau bouillante sal"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 17934,
          "logprob": null,
          "text": "Pour"
        },
        {
          "id": 49833,
          "logprob": -10.53125,
          "text": " dég"
        },
        {
          "id": 21543,
          "logprob": -0.14770508,
          "text": "uster"
        },
        {
          "id": 447,
          "logprob": -1.9287109,
          "text": " un"
        },
        {
          "id": 46341,
          "logprob": -15.4140625,
          "text": " ort"
        },
        {
          "id": 35567,
          "logprob": -7.5234375,
          "text": "olan"
        },
        {
          "id": 15,
          "logprob": -1.3613281,
          "text": ","
        },
        {
          "id": 1669,
          "logprob": -1.5458984,
          "text": " il"
        },
        {
          "id": 11580,
          "logprob": -0.94189453,
          "text": " faut"
        },
        {
          "id": 3913,
          "logprob": -3.7011719,
          "text": " tout"
        },
        {
          "id": 39261,
          "logprob": -1.5732422,
          "text": " d'abord"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 578,
          "logprob": -1.7548828,
          "special": false,
          "text": " le"
        },
        {
          "id": 5608,
          "logprob": -2.578125,
          "special": false,
          "text": " faire"
        },
        {
          "id": 1767,
          "logprob": -1.5117188,
          "special": false,
          "text": " cu"
        },
        {
          "id": 1273,
          "logprob": -0.00010049343,
          "special": false,
          "text": "ire"
        },
        {
          "id": 1486,
          "logprob": -1.4707031,
          "special": false,
          "text": " dans"
        },
        {
          "id": 283,
          "logprob": -1.1982422,
          "special": false,
          "text": " de"
        },
        {
          "id": 40410,
          "logprob": -0.11004639,
          "special": false,
          "text": " l'eau"
        },
        {
          "id": 20226,
          "logprob": -0.4506836,
          "special": false,
          "text": " bou"
        },
        {
          "id": 172483,
          "logprob": -0.003047943,
          "special": false,
          "text": "illante"
        },
        {
          "id": 2805,
          "logprob": -1.0185547,
          "special": false,
          "text": " sal"
        }
      ]
    },
    "generated_text": " le faire cuire dans de l'eau bouillante sal"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 17934,
          "logprob": null,
          "text": "Pour"
        },
        {
          "id": 49833,
          "logprob": -10.53125,
          "text": " dég"
        },
        {
          "id": 21543,
          "logprob": -0.14770508,
          "text": "uster"
        },
        {
          "id": 447,
          "logprob": -1.9287109,
          "text": " un"
        },
        {
          "id": 46341,
          "logprob": -15.4140625,
          "text": " ort"
        },
        {
          "id": 35567,
          "logprob": -7.5234375,
          "text": "olan"
        },
        {
          "id": 15,
          "logprob": -1.3613281,
          "text": ","
        },
        {
          "id": 1669,
          "logprob": -1.5458984,
          "text": " il"
        },
        {
          "id": 11580,
          "logprob": -0.94189453,
          "text": " faut"
        },
        {
          "id": 3913,
          "logprob": -3.7011719,
          "text": " tout"
        },
        {
          "id": 39261,
          "logprob": -1.5732422,
          "text": " d'abord"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 578,
          "logprob": -1.7548828,
          "special": false,
          "text": " le"
        },
        {
          "id": 5608,
          "logprob": -2.578125,
          "special": false,
          "text": " faire"
        },
        {
          "id": 1767,
          "logprob": -1.5117188,
          "special": false,
          "text": " cu"
        },
        {
          "id": 1273,
          "logprob": -0.00010049343,
          "special": false,
          "text": "ire"
        },
        {
          "id": 1486,
          "logprob": -1.4707031,
          "special": false,
          "text": " dans"
        },
        {
          "id": 283,
          "logprob": -1.1982422,
          "special": false,
          "text": " de"
        },
        {
          "id": 40410,
          "logprob": -0.11004639,
          "special": false,
          "text": " l'eau"
        },
        {
          "id": 20226,
          "logprob": -0.4506836,
          "special": false,
          "text": " bou"
        },
        {
          "id": 172483,
          "logprob": -0.003047943,
          "special": false,
          "text": "illante"
        },
        {
          "id": 2805,
          "logprob": -1.0185547,
          "special": false,
          "text": " sal"
        }
      ]
    },
    "generated_text": " le faire cuire dans de l'eau bouillante sal"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 17934,
        "logprob": null,
        "text": "Pour"
      },
      {
        "id": 49833,
        "logprob": -10.546875,
        "text": " dég"
      },
      {
        "id": 21543,
        "logprob": -0.14819336,
        "text": "uster"
      },
      {
        "id": 447,
        "logprob": -1.9257812,
        "text": " un"
      },
      {
        "id": 46341,
        "logprob": -15.4296875,
        "text": " ort"
      },
      {
        "id": 35567,
        "logprob": -7.5625,
        "text": "olan"
      },
      {
        "id": 15,
        "logprob": -1.4199219,
        "text": ","
      },
      {
        "id": 1669,
        "logprob": -1.5634766,
        "text": " il"
      },
      {
        "id": 11580,
        "logprob": -0.9458008,
        "text": " faut"
      },
      {
        "id": 3913,
        "logprob": -3.6816406,
        "text": " tout"
      },
      {
        "id": 39261,
        "logprob": -1.7753906,
        "text": " d'abord"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 578,
        "logprob": -1.828125,
        "special": false,
        "text": " le"
      },
      {
        "id": 5608,
        "logprob": -2.5546875,
        "special": false,
        "text": " faire"
      },
      {
        "id": 7735,
        "logprob": -2.4277344,
        "special": false,
        "text": " fond"
      },
      {
        "id": 289,
        "logprob": 0.0,
        "special": false,
        "text": "re"
      },
      {
        "id": 693,
        "logprob": -2.4472656,
        "special": false,
        "text": " à"
      },
      {
        "id": 366,
        "logprob": -1.1494141,
        "special": false,
        "text": " la"
      },
      {
        "id": 48844,
        "logprob": -1.7939453,
        "special": false,
        "text": " cass"
      },
      {
        "id": 1744,
        "logprob": 0.0,
        "special": false,
        "text": "ero"
      },
      {
        "id": 327,
        "logprob": 0.0,
        "special": false,
        "text": "le"
      },
      {
        "id": 2940,
        "logprob": -1.9013672,
        "special": false,
        "text": " avec"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " le faire fondre à la casserole avec"
}


================================================
FILE: integration-tests/models/__snapshots__/test_bloom_560m_sharded/test_bloom_560m_sharded_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 17934,
          "logprob": null,
          "text": "Pour"
        },
        {
          "id": 49833,
          "logprob": -10.5390625,
          "text": " dég"
        },
        {
          "id": 21543,
          "logprob": -0.14758301,
          "text": "uster"
        },
        {
          "id": 447,
          "logprob": -1.9296875,
          "text": " un"
        },
        {
          "id": 46341,
          "logprob": -15.4453125,
          "text": " ort"
        },
        {
          "id": 35567,
          "logprob": -7.59375,
          "text": "olan"
        },
        {
          "id": 15,
          "logprob": -1.3994141,
          "text": ","
        },
        {
          "id": 1669,
          "logprob": -1.578125,
          "text": " il"
        },
        {
          "id": 11580,
          "logprob": -0.9453125,
          "text": " faut"
        },
        {
          "id": 3913,
          "logprob": -3.7011719,
          "text": " tout"
        },
        {
          "id": 39261,
          "logprob": -1.5732422,
          "text": " d'abord"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 578,
          "logprob": -1.7529297,
          "special": false,
          "text": " le"
        },
        {
          "id": 5608,
          "logprob": -2.6054688,
          "special": false,
          "text": " faire"
        },
        {
          "id": 1767,
          "logprob": -1.5283203,
          "special": false,
          "text": " cu"
        },
        {
          "id": 1273,
          "logprob": -0.00010049343,
          "special": false,
          "text": "ire"
        },
        {
          "id": 1486,
          "logprob": -1.4716797,
          "special": false,
          "text": " dans"
        },
        {
          "id": 283,
          "logprob": -1.1982422,
          "special": false,
          "text": " de"
        },
        {
          "id": 40410,
          "logprob": -0.11853027,
          "special": false,
          "text": " l'eau"
        },
        {
          "id": 20226,
          "logprob": -0.41210938,
          "special": false,
          "text": " bou"
        },
        {
          "id": 172483,
          "logprob": -0.0037765503,
          "special": false,
          "text": "illante"
        },
        {
          "id": 2805,
          "logprob": -1.0166016,
          "special": false,
          "text": " sal"
        }
      ]
    },
    "generated_text": " le faire cuire dans de l'eau bouillante sal"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 17934,
          "logprob": null,
          "text": "Pour"
        },
        {
          "id": 49833,
          "logprob": -10.515625,
          "text": " dég"
        },
        {
          "id": 21543,
          "logprob": -0.1484375,
          "text": "uster"
        },
        {
          "id": 447,
          "logprob": -1.9287109,
          "text": " un"
        },
        {
          "id": 46341,
          "logprob": -15.34375,
          "text": " ort"
        },
        {
          "id": 35567,
          "logprob": -7.515625,
          "text": "olan"
        },
        {
          "id": 15,
          "logprob": -1.4199219,
          "text": ","
        },
        {
          "id": 1669,
          "logprob": -1.5664062,
          "text": " il"
        },
        {
          "id": 11580,
          "logprob": -0.94091797,
          "text": " faut"
        },
        {
          "id": 3913,
          "logprob": -3.6660156,
          "text": " tout"
        },
        {
          "id": 39261,
          "logprob": -1.7753906,
          "text": " d'abord"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 578,
          "logprob": -1.7626953,
          "special": false,
          "text": " le"
        },
        {
          "id": 5608,
          "logprob": -2.5820312,
          "special": false,
          "text": " faire"
        },
        {
          "id": 1767,
          "logprob": -1.5097656,
          "special": false,
          "text": " cu"
        },
        {
          "id": 1273,
          "logprob": -9.393692e-05,
          "special": false,
          "text": "ire"
        },
        {
          "id": 1486,
          "logprob": -1.5175781,
          "special": false,
          "text": " dans"
        },
        {
          "id": 283,
          "logprob": -1.1982422,
          "special": false,
          "text": " de"
        },
        {
          "id": 40410,
          "logprob": -0.11883545,
          "special": false,
          "text": " l'eau"
        },
        {
          "id": 20226,
          "logprob": -0.4909668,
          "special": false,
          "text": " bou"
        },
        {
          "id": 172483,
          "logprob": -0.003047943,
          "special": false,
          "text": "illante"
        },
        {
          "id": 2805,
          "logprob": -1.0185547,
          "special": false,
          "text": " sal"
        }
      ]
    },
    "generated_text": " le faire cuire dans de l'eau bouillante sal"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 17934,
          "logprob": null,
          "text": "Pour"
        },
        {
          "id": 49833,
          "logprob": -10.515625,
          "text": " dég"
        },
        {
          "id": 21543,
          "logprob": -0.1484375,
          "text": "uster"
        },
        {
          "id": 447,
          "logprob": -1.9287109,
          "text": " un"
        },
        {
          "id": 46341,
          "logprob": -15.34375,
          "text": " ort"
        },
        {
          "id": 35567,
          "logprob": -7.515625,
          "text": "olan"
        },
        {
          "id": 15,
          "logprob": -1.4199219,
          "text": ","
        },
        {
          "id": 1669,
          "logprob": -1.5664062,
          "text": " il"
        },
        {
          "id": 11580,
          "logprob": -0.94091797,
          "text": " faut"
        },
        {
          "id": 3913,
          "logprob": -3.6660156,
          "text": " tout"
        },
        {
          "id": 39261,
          "logprob": -1.7753906,
          "text": " d'abord"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 578,
          "logprob": -1.7626953,
          "special": false,
          "text": " le"
        },
        {
          "id": 5608,
          "logprob": -2.5820312,
          "special": false,
          "text": " faire"
        },
        {
          "id": 1767,
          "logprob": -1.5097656,
          "special": false,
          "text": " cu"
        },
        {
          "id": 1273,
          "logprob": -9.393692e-05,
          "special": false,
          "text": "ire"
        },
        {
          "id": 1486,
          "logprob": -1.5175781,
          "special": false,
          "text": " dans"
        },
        {
          "id": 283,
          "logprob": -1.1982422,
          "special": false,
          "text": " de"
        },
        {
          "id": 40410,
          "logprob": -0.11883545,
          "special": false,
          "text": " l'eau"
        },
        {
          "id": 20226,
          "logprob": -0.4909668,
          "special": false,
          "text": " bou"
        },
        {
          "id": 172483,
          "logprob": -0.003047943,
          "special": false,
          "text": "illante"
        },
        {
          "id": 2805,
          "logprob": -1.0185547,
          "special": false,
          "text": " sal"
        }
      ]
    },
    "generated_text": " le faire cuire dans de l'eau bouillante sal"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 17934,
          "logprob": null,
          "text": "Pour"
        },
        {
          "id": 49833,
          "logprob": -10.515625,
          "text": " dég"
        },
        {
          "id": 21543,
          "logprob": -0.1484375,
          "text": "uster"
        },
        {
          "id": 447,
          "logprob": -1.9287109,
          "text": " un"
        },
        {
          "id": 46341,
          "logprob": -15.34375,
          "text": " ort"
        },
        {
          "id": 35567,
          "logprob": -7.515625,
          "text": "olan"
        },
        {
          "id": 15,
          "logprob": -1.4199219,
          "text": ","
        },
        {
          "id": 1669,
          "logprob": -1.5664062,
          "text": " il"
        },
        {
          "id": 11580,
          "logprob": -0.94091797,
          "text": " faut"
        },
        {
          "id": 3913,
          "logprob": -3.6660156,
          "text": " tout"
        },
        {
          "id": 39261,
          "logprob": -1.7753906,
          "text": " d'abord"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 578,
          "logprob": -1.7626953,
          "special": false,
          "text": " le"
        },
        {
          "id": 5608,
          "logprob": -2.5820312,
          "special": false,
          "text": " faire"
        },
        {
          "id": 1767,
          "logprob": -1.5097656,
          "special": false,
          "text": " cu"
        },
        {
          "id": 1273,
          "logprob": -9.393692e-05,
          "special": false,
          "text": "ire"
        },
        {
          "id": 1486,
          "logprob": -1.5175781,
          "special": false,
          "text": " dans"
        },
        {
          "id": 283,
          "logprob": -1.1982422,
          "special": false,
          "text": " de"
        },
        {
          "id": 40410,
          "logprob": -0.11883545,
          "special": false,
          "text": " l'eau"
        },
        {
          "id": 20226,
          "logprob": -0.4909668,
          "special": false,
          "text": " bou"
        },
        {
          "id": 172483,
          "logprob": -0.003047943,
          "special": false,
          "text": "illante"
        },
        {
          "id": 2805,
          "logprob": -1.0185547,
          "special": false,
          "text": " sal"
        }
      ]
    },
    "generated_text": " le faire cuire dans de l'eau bouillante sal"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_chat_llama/test_flash_llama_simple.json
================================================
{
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "As of your last question, the weather in Brooklyn, New York, is typically hot and humid throughout the year. The suburbs around New York City are jealously sheltered, and at least in the Lower Bronx, there are very few outdoor environments to appreciate nature.\n\nIn terms of temperature, the warmest times of the year are from June to August, when average high temperatures typically range from around 73°F or 23°C",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1724792495,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "chat.completion",
  "system_fingerprint": "2.2.1-dev0-native",
  "usage": {
    "completion_tokens": 100,
    "prompt_tokens": 61,
    "total_tokens": 161
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_nousage.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": "OK",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741265520,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "!",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741265520,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": "stop",
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741265520,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_completion_prompts/test_chat_hfhub_usage.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": "OK",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741266005,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "!",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741266005,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": "stop",
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741266005,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [],
    "created": 1741266005,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": {
      "completion_tokens": 3,
      "prompt_tokens": 39,
      "total_tokens": 42
    }
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_nousage.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": "OK",
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741265134,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "!",
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741265134,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "",
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": "stop",
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741265134,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_completion_prompts/test_chat_openai_usage.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": "OK",
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741265133,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "!",
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741265133,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "",
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": "stop",
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741265133,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [],
    "created": 1741265133,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": {
      "completion_tokens": 3,
      "completion_tokens_details": null,
      "prompt_tokens": 39,
      "prompt_tokens_details": null,
      "total_tokens": 42
    }
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json
================================================
{
  "choices": [
    {
      "finish_reason": "length",
      "index": 1,
      "logprobs": null,
      "text": " This is a question that has puzzled many people for"
    },
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": " A Beginner’s Guide\nDeep learning is a subset"
    },
    {
      "finish_reason": "length",
      "index": 3,
      "logprobs": null,
      "text": "usculas_minusculas(s):\n    \"\"\"\n"
    },
    {
      "finish_reason": "length",
      "index": 2,
      "logprobs": null,
      "text": " Paris\nWhat is the capital of France?\nThe"
    }
  ],
  "created": 1741264813,
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "text_completion",
  "system_fingerprint": "3.1.2-dev0-native",
  "usage": {
    "completion_tokens": 40,
    "prompt_tokens": 22,
    "total_tokens": 62
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json
================================================
[
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 0,
        "logprobs": null,
        "text": " A"
      }
    ],
    "created": 1741340006,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 1,
        "logprobs": null,
        "text": " This"
      }
    ],
    "created": 1741340006,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 2,
        "logprobs": null,
        "text": " Paris"
      }
    ],
    "created": 1741340006,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 3,
        "logprobs": null,
        "text": "us"
      }
    ],
    "created": 1741340006,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 0,
        "logprobs": null,
        "text": " Beginner"
      }
    ],
    "created": 1741340006,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 1,
        "logprobs": null,
        "text": " is"
      }
    ],
    "created": 1741340006,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 2,
        "logprobs": null,
        "text": "\n"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 3,
        "logprobs": null,
        "text": "cul"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 0,
        "logprobs": null,
        "text": "’s"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 1,
        "logprobs": null,
        "text": " a"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 2,
        "logprobs": null,
        "text": "What"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 3,
        "logprobs": null,
        "text": "as"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 0,
        "logprobs": null,
        "text": " Guide"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 1,
        "logprobs": null,
        "text": " question"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 2,
        "logprobs": null,
        "text": " is"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 3,
        "logprobs": null,
        "text": "_minus"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 0,
        "logprobs": null,
        "text": "\n"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 1,
        "logprobs": null,
        "text": " that"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 2,
        "logprobs": null,
        "text": " the"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 3,
        "logprobs": null,
        "text": "cul"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 0,
        "logprobs": null,
        "text": "Deep"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 1,
        "logprobs": null,
        "text": " has"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 2,
        "logprobs": null,
        "text": " capital"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 3,
        "logprobs": null,
        "text": "as"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 0,
        "logprobs": null,
        "text": " learning"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 1,
        "logprobs": null,
        "text": " puzzled"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 2,
        "logprobs": null,
        "text": " of"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 3,
        "logprobs": null,
        "text": "(s"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 0,
        "logprobs": null,
        "text": " is"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 1,
        "logprobs": null,
        "text": " many"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 2,
        "logprobs": null,
        "text": " France"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 3,
        "logprobs": null,
        "text": "):\n"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 0,
        "logprobs": null,
        "text": " a"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 1,
        "logprobs": null,
        "text": " people"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 2,
        "logprobs": null,
        "text": "?\n"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "",
        "index": 3,
        "logprobs": null,
        "text": "   "
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "text": " subset"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "completion_tokens_details": null,
      "prompt_tokens": 6,
      "prompt_tokens_details": null,
      "total_tokens": 16
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 1,
        "logprobs": null,
        "text": " for"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "completion_tokens_details": null,
      "prompt_tokens": 5,
      "prompt_tokens_details": null,
      "total_tokens": 15
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 2,
        "logprobs": null,
        "text": "The"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "completion_tokens_details": null,
      "prompt_tokens": 8,
      "prompt_tokens_details": null,
      "total_tokens": 18
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 3,
        "logprobs": null,
        "text": " \"\"\"\n"
      }
    ],
    "created": 1741340007,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "text_completion",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "completion_tokens_details": null,
      "prompt_tokens": 3,
      "prompt_tokens_details": null,
      "total_tokens": 13
    }
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json
================================================
{
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": " A Beginner’s Guide\nDeep learning is a subset"
    }
  ],
  "created": 1741264812,
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "text_completion",
  "system_fingerprint": "3.1.2-dev0-native",
  "usage": {
    "completion_tokens": 10,
    "prompt_tokens": 6,
    "total_tokens": 16
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": "**",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373593,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "Deep",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373593,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " Learning",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373593,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ":",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373594,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " An",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373594,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " Overview",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373594,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "**\n",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373594,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "================================",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373594,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "=====",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373594,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\n\n",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": "length",
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741373594,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [],
    "created": 1741373594,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 40,
      "total_tokens": 50
    }
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 323,
        "logprob": -1.1171875,
        "special": false,
        "text": " and"
      },
      {
        "id": 1268,
        "logprob": -0.9477539,
        "special": false,
        "text": " how"
      },
      {
        "id": 1587,
        "logprob": -0.51464844,
        "special": false,
        "text": " does"
      },
      {
        "id": 433,
        "logprob": -0.043182373,
        "special": false,
        "text": " it"
      },
      {
        "id": 1782,
        "logprob": -1.0810547,
        "special": false,
        "text": " differ"
      },
      {
        "id": 505,
        "logprob": -0.005054474,
        "special": false,
        "text": " from"
      },
      {
        "id": 8776,
        "logprob": -0.47485352,
        "special": false,
        "text": " traditional"
      },
      {
        "id": 5780,
        "logprob": -0.15112305,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6975,
        "logprob": -0.0011291504,
        "special": false,
        "text": " learning"
      },
      {
        "id": 5380,
        "logprob": -0.31323242,
        "special": false,
        "text": "?\n"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " and how does it differ from traditional machine learning?\n"
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 5380,
        "logprob": 0.0,
        "special": false,
        "text": "?\n"
      },
      {
        "id": 34564,
        "logprob": 0.0,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6975,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      },
      {
        "id": 11,
        "logprob": 0.0,
        "special": false,
        "text": ","
      },
      {
        "id": 1101,
        "logprob": -1.0136719,
        "special": false,
        "text": " also"
      },
      {
        "id": 3967,
        "logprob": 0.0,
        "special": false,
        "text": " known"
      },
      {
        "id": 439,
        "logprob": 0.0,
        "special": false,
        "text": " as"
      },
      {
        "id": 30828,
        "logprob": 0.0,
        "special": false,
        "text": " neural"
      },
      {
        "id": 4009,
        "logprob": -0.21923828,
        "special": false,
        "text": " network"
      },
      {
        "id": 477,
        "logprob": -1.4824219,
        "special": false,
        "text": " or"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int/test_compressed_tensors_w8a8_int_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 323,
          "logprob": -1.1171875,
          "special": false,
          "text": " and"
        },
        {
          "id": 1268,
          "logprob": -0.9477539,
          "special": false,
          "text": " how"
        },
        {
          "id": 1587,
          "logprob": -0.51464844,
          "special": false,
          "text": " does"
        },
        {
          "id": 433,
          "logprob": -0.043182373,
          "special": false,
          "text": " it"
        },
        {
          "id": 1782,
          "logprob": -1.0810547,
          "special": false,
          "text": " differ"
        },
        {
          "id": 505,
          "logprob": -0.005054474,
          "special": false,
          "text": " from"
        },
        {
          "id": 8776,
          "logprob": -0.47485352,
          "special": false,
          "text": " traditional"
        },
        {
          "id": 5780,
          "logprob": -0.15112305,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.0011291504,
          "special": false,
          "text": " learning"
        },
        {
          "id": 5380,
          "logprob": -0.3173828,
          "special": false,
          "text": "?\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " and how does it differ from traditional machine learning?\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 323,
          "logprob": -1.1171875,
          "special": false,
          "text": " and"
        },
        {
          "id": 1268,
          "logprob": -0.9477539,
          "special": false,
          "text": " how"
        },
        {
          "id": 1587,
          "logprob": -0.51464844,
          "special": false,
          "text": " does"
        },
        {
          "id": 433,
          "logprob": -0.043182373,
          "special": false,
          "text": " it"
        },
        {
          "id": 1782,
          "logprob": -1.0810547,
          "special": false,
          "text": " differ"
        },
        {
          "id": 505,
          "logprob": -0.005054474,
          "special": false,
          "text": " from"
        },
        {
          "id": 8776,
          "logprob": -0.47485352,
          "special": false,
          "text": " traditional"
        },
        {
          "id": 5780,
          "logprob": -0.15112305,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.0011291504,
          "special": false,
          "text": " learning"
        },
        {
          "id": 5380,
          "logprob": -0.3173828,
          "special": false,
          "text": "?\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " and how does it differ from traditional machine learning?\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 323,
          "logprob": -1.1171875,
          "special": false,
          "text": " and"
        },
        {
          "id": 1268,
          "logprob": -0.9477539,
          "special": false,
          "text": " how"
        },
        {
          "id": 1587,
          "logprob": -0.51464844,
          "special": false,
          "text": " does"
        },
        {
          "id": 433,
          "logprob": -0.043182373,
          "special": false,
          "text": " it"
        },
        {
          "id": 1782,
          "logprob": -1.0810547,
          "special": false,
          "text": " differ"
        },
        {
          "id": 505,
          "logprob": -0.005054474,
          "special": false,
          "text": " from"
        },
        {
          "id": 8776,
          "logprob": -0.47485352,
          "special": false,
          "text": " traditional"
        },
        {
          "id": 5780,
          "logprob": -0.15112305,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.0011291504,
          "special": false,
          "text": " learning"
        },
        {
          "id": 5380,
          "logprob": -0.3173828,
          "special": false,
          "text": "?\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " and how does it differ from traditional machine learning?\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 323,
          "logprob": -1.1171875,
          "special": false,
          "text": " and"
        },
        {
          "id": 1268,
          "logprob": -0.9477539,
          "special": false,
          "text": " how"
        },
        {
          "id": 1587,
          "logprob": -0.51464844,
          "special": false,
          "text": " does"
        },
        {
          "id": 433,
          "logprob": -0.043182373,
          "special": false,
          "text": " it"
        },
        {
          "id": 1782,
          "logprob": -1.0810547,
          "special": false,
          "text": " differ"
        },
        {
          "id": 505,
          "logprob": -0.005054474,
          "special": false,
          "text": " from"
        },
        {
          "id": 8776,
          "logprob": -0.47485352,
          "special": false,
          "text": " traditional"
        },
        {
          "id": 5780,
          "logprob": -0.15112305,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.0011291504,
          "special": false,
          "text": " learning"
        },
        {
          "id": 5380,
          "logprob": -0.3173828,
          "special": false,
          "text": "?\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " and how does it differ from traditional machine learning?\n"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 76,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 18183,
        "logprob": -1.5195312,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 6832,
        "logprob": -0.06817627,
        "special": false,
        "text": " learning"
      },
      {
        "id": 374,
        "logprob": -0.13122559,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.13415527,
        "special": false,
        "text": " a"
      },
      {
        "id": 25993,
        "logprob": -0.8769531,
        "special": false,
        "text": " subset"
      },
      {
        "id": 315,
        "logprob": -0.0011396408,
        "special": false,
        "text": " of"
      },
      {
        "id": 5662,
        "logprob": -0.16442871,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6832,
        "logprob": -0.0026416779,
        "special": false,
        "text": " learning"
      },
      {
        "id": 429,
        "logprob": -0.48754883,
        "special": false,
        "text": " that"
      },
      {
        "id": 5711,
        "logprob": -1.2294922,
        "special": false,
        "text": " uses"
      },
      {
        "id": 29728,
        "logprob": -0.66503906,
        "special": false,
        "text": " neural"
      },
      {
        "id": 14155,
        "logprob": -0.02960205,
        "special": false,
        "text": " networks"
      },
      {
        "id": 311,
        "logprob": -0.7236328,
        "special": false,
        "text": " to"
      },
      {
        "id": 3960,
        "logprob": -1.1914062,
        "special": false,
        "text": " learn"
      },
      {
        "id": 504,
        "logprob": -0.7089844,
        "special": false,
        "text": " from"
      },
      {
        "id": 821,
        "logprob": -0.7729492,
        "special": false,
        "text": " data"
      },
      {
        "id": 13,
        "logprob": -0.7836914,
        "special": false,
        "text": "."
      },
      {
        "id": 1084,
        "logprob": -0.9941406,
        "special": false,
        "text": " It"
      },
      {
        "id": 374,
        "logprob": -0.52441406,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.9511719,
        "special": false,
        "text": " a"
      },
      {
        "id": 943,
        "logprob": -0.8642578,
        "special": false,
        "text": " type"
      },
      {
        "id": 315,
        "logprob": -0.00030231476,
        "special": false,
        "text": " of"
      },
      {
        "id": 20443,
        "logprob": -0.14416504,
        "special": false,
        "text": " artificial"
      },
      {
        "id": 11229,
        "logprob": -0.013824463,
        "special": false,
        "text": " intelligence"
      },
      {
        "id": 429,
        "logprob": -0.18762207,
        "special": false,
        "text": " that"
      },
      {
        "id": 646,
        "logprob": -1.0087891,
        "special": false,
        "text": " can"
      },
      {
        "id": 3960,
        "logprob": -0.90234375,
        "special": false,
        "text": " learn"
      },
      {
        "id": 504,
        "logprob": -0.54345703,
        "special": false,
        "text": " from"
      },
      {
        "id": 323,
        "logprob": -1.0400391,
        "special": false,
        "text": " and"
      },
      {
        "id": 1281,
        "logprob": -0.072509766,
        "special": false,
        "text": " make"
      },
      {
        "id": 19898,
        "logprob": -0.16516113,
        "special": false,
        "text": " predictions"
      },
      {
        "id": 389,
        "logprob": -0.4416504,
        "special": false,
        "text": " on"
      },
      {
        "id": 3460,
        "logprob": -0.5385742,
        "special": false,
        "text": " large"
      },
      {
        "id": 14713,
        "logprob": -0.4387207,
        "special": false,
        "text": " amounts"
      },
      {
        "id": 315,
        "logprob": -0.00015091896,
        "special": false,
        "text": " of"
      },
      {
        "id": 821,
        "logprob": -0.061431885,
        "special": false,
        "text": " data"
      },
      {
        "id": 13,
        "logprob": -0.71875,
        "special": false,
        "text": "."
      },
      {
        "id": 18183,
        "logprob": -0.23632812,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 6832,
        "logprob": -0.0017204285,
        "special": false,
        "text": " learning"
      },
      {
        "id": 374,
        "logprob": -1.1738281,
        "special": false,
        "text": " is"
      },
      {
        "id": 1483,
        "logprob": -0.61083984,
        "special": false,
        "text": " used"
      },
      {
        "id": 304,
        "logprob": -0.035003662,
        "special": false,
        "text": " in"
      },
      {
        "id": 264,
        "logprob": -0.118652344,
        "special": false,
        "text": " a"
      },
      {
        "id": 8045,
        "logprob": -0.42016602,
        "special": false,
        "text": " variety"
      },
      {
        "id": 315,
        "logprob": -1.6212463e-05,
        "special": false,
        "text": " of"
      },
      {
        "id": 8357,
        "logprob": -0.1315918,
        "special": false,
        "text": " applications"
      },
      {
        "id": 11,
        "logprob": -0.12915039,
        "special": false,
        "text": ","
      },
      {
        "id": 2670,
        "logprob": -0.12463379,
        "special": false,
        "text": " including"
      },
      {
        "id": 2168,
        "logprob": -0.37402344,
        "special": false,
        "text": " image"
      },
      {
        "id": 323,
        "logprob": -0.1451416,
        "special": false,
        "text": " and"
      },
      {
        "id": 8806,
        "logprob": -0.028869629,
        "special": false,
        "text": " speech"
      },
      {
        "id": 17843,
        "logprob": -0.00024068356,
        "special": false,
        "text": " recognition"
      },
      {
        "id": 11,
        "logprob": -0.00031018257,
        "special": false,
        "text": ","
      },
      {
        "id": 5810,
        "logprob": -0.019821167,
        "special": false,
        "text": " natural"
      },
      {
        "id": 4128,
        "logprob": -0.00012528896,
        "special": false,
        "text": " language"
      },
      {
        "id": 8692,
        "logprob": -0.00089263916,
        "special": false,
        "text": " processing"
      },
      {
        "id": 11,
        "logprob": -0.00073862076,
        "special": false,
        "text": ","
      },
      {
        "id": 323,
        "logprob": -0.040161133,
        "special": false,
        "text": " and"
      },
      {
        "id": 38193,
        "logprob": -0.4519043,
        "special": false,
        "text": " autonomous"
      },
      {
        "id": 11474,
        "logprob": -0.39941406,
        "special": false,
        "text": " vehicles"
      },
      {
        "id": 13,
        "logprob": -0.21166992,
        "special": false,
        "text": "."
      },
      {
        "id": 1084,
        "logprob": -0.9082031,
        "special": false,
        "text": " It"
      },
      {
        "id": 374,
        "logprob": -0.44213867,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -1.2177734,
        "special": false,
        "text": " a"
      },
      {
        "id": 18512,
        "logprob": -0.5205078,
        "special": false,
        "text": " rapidly"
      },
      {
        "id": 7826,
        "logprob": -0.15332031,
        "special": false,
        "text": " growing"
      },
      {
        "id": 2070,
        "logprob": -0.0039978027,
        "special": false,
        "text": " field"
      },
      {
        "id": 448,
        "logprob": -0.9091797,
        "special": false,
        "text": " with"
      },
      {
        "id": 1657,
        "logprob": -0.17114258,
        "special": false,
        "text": " many"
      },
      {
        "id": 4650,
        "logprob": -0.70703125,
        "special": false,
        "text": " potential"
      },
      {
        "id": 8357,
        "logprob": -0.025131226,
        "special": false,
        "text": " applications"
      },
      {
        "id": 304,
        "logprob": -0.6699219,
        "special": false,
        "text": " in"
      },
      {
        "id": 279,
        "logprob": -0.35205078,
        "special": false,
        "text": " the"
      },
      {
        "id": 3853,
        "logprob": -0.049194336,
        "special": false,
        "text": " future"
      },
      {
        "id": 13,
        "logprob": -0.21972656,
        "special": false,
        "text": "."
      },
      {
        "id": 151643,
        "logprob": -2.0019531,
        "special": true,
        "text": "<|endoftext|>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future."
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 5267,
        "logprob": -1.1464844,
        "special": false,
        "text": "?\n"
      },
      {
        "id": 33464,
        "logprob": -0.83203125,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 20909,
        "logprob": -0.5625,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 320,
        "logprob": -2.1464844,
        "special": false,
        "text": " ("
      },
      {
        "id": 16524,
        "logprob": 0.0,
        "special": false,
        "text": "DL"
      },
      {
        "id": 701,
        "logprob": -2.2089844,
        "special": false,
        "text": "),"
      },
      {
        "id": 476,
        "logprob": -0.27368164,
        "special": false,
        "text": " or"
      },
      {
        "id": 20443,
        "logprob": -0.09442139,
        "special": false,
        "text": " artificial"
      },
      {
        "id": 29728,
        "logprob": 0.0,
        "special": false,
        "text": " neural"
      },
      {
        "id": 14155,
        "logprob": 0.0,
        "special": false,
        "text": " networks"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep Learning (DL), or artificial neural networks"
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_w8a8_int_dynamic_weight/test_compressed_tensors_w8a8_int_dynamic_weight_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18183,
          "logprob": -1.5195312,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6832,
          "logprob": -0.06817627,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.13122559,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.13415527,
          "special": false,
          "text": " a"
        },
        {
          "id": 25993,
          "logprob": -0.87353516,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.0011396408,
          "special": false,
          "text": " of"
        },
        {
          "id": 5662,
          "logprob": -0.16442871,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6832,
          "logprob": -0.0026416779,
          "special": false,
          "text": " learning"
        },
        {
          "id": 429,
          "logprob": -0.48754883,
          "special": false,
          "text": " that"
        },
        {
          "id": 5711,
          "logprob": -1.2294922,
          "special": false,
          "text": " uses"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a subset of machine learning that uses"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18183,
          "logprob": -1.5195312,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6832,
          "logprob": -0.06817627,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.13122559,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.13415527,
          "special": false,
          "text": " a"
        },
        {
          "id": 25993,
          "logprob": -0.87353516,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.0011396408,
          "special": false,
          "text": " of"
        },
        {
          "id": 5662,
          "logprob": -0.16442871,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6832,
          "logprob": -0.0026416779,
          "special": false,
          "text": " learning"
        },
        {
          "id": 429,
          "logprob": -0.48754883,
          "special": false,
          "text": " that"
        },
        {
          "id": 5711,
          "logprob": -1.2294922,
          "special": false,
          "text": " uses"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a subset of machine learning that uses"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18183,
          "logprob": -1.5195312,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6832,
          "logprob": -0.06817627,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.13122559,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.13415527,
          "special": false,
          "text": " a"
        },
        {
          "id": 25993,
          "logprob": -0.87353516,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.0011396408,
          "special": false,
          "text": " of"
        },
        {
          "id": 5662,
          "logprob": -0.16442871,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6832,
          "logprob": -0.0026416779,
          "special": false,
          "text": " learning"
        },
        {
          "id": 429,
          "logprob": -0.48754883,
          "special": false,
          "text": " that"
        },
        {
          "id": 5711,
          "logprob": -1.2294922,
          "special": false,
          "text": " uses"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a subset of machine learning that uses"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18183,
          "logprob": -1.5195312,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6832,
          "logprob": -0.06817627,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.13122559,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.13415527,
          "special": false,
          "text": " a"
        },
        {
          "id": 25993,
          "logprob": -0.87353516,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.0011396408,
          "special": false,
          "text": " of"
        },
        {
          "id": 5662,
          "logprob": -0.16442871,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6832,
          "logprob": -0.0026416779,
          "special": false,
          "text": " learning"
        },
        {
          "id": 429,
          "logprob": -0.48754883,
          "special": false,
          "text": " that"
        },
        {
          "id": 5711,
          "logprob": -1.2294922,
          "special": false,
          "text": " uses"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a subset of machine learning that uses"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 18682,
        "logprob": -0.8769531,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 6975,
        "logprob": -0.0076942444,
        "special": false,
        "text": " learning"
      },
      {
        "id": 374,
        "logprob": -0.25073242,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.097595215,
        "special": false,
        "text": " a"
      },
      {
        "id": 955,
        "logprob": -0.921875,
        "special": false,
        "text": " type"
      },
      {
        "id": 315,
        "logprob": -0.00027918816,
        "special": false,
        "text": " of"
      },
      {
        "id": 21075,
        "logprob": -0.5527344,
        "special": false,
        "text": " artificial"
      },
      {
        "id": 11478,
        "logprob": -0.042541504,
        "special": false,
        "text": " intelligence"
      },
      {
        "id": 320,
        "logprob": -0.38891602,
        "special": false,
        "text": " ("
      },
      {
        "id": 15836,
        "logprob": -0.0011043549,
        "special": false,
        "text": "AI"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " Deep learning is a type of artificial intelligence (AI"
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 5380,
        "logprob": -0.23840332,
        "special": false,
        "text": "?\n"
      },
      {
        "id": 34564,
        "logprob": 0.0,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6975,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      },
      {
        "id": 11,
        "logprob": 0.0,
        "special": false,
        "text": ","
      },
      {
        "id": 1101,
        "logprob": -1.2011719,
        "special": false,
        "text": " also"
      },
      {
        "id": 3967,
        "logprob": 0.0,
        "special": false,
        "text": " known"
      },
      {
        "id": 439,
        "logprob": 0.0,
        "special": false,
        "text": " as"
      },
      {
        "id": 30828,
        "logprob": 0.0,
        "special": false,
        "text": " neural"
      },
      {
        "id": 4009,
        "logprob": -0.6777344,
        "special": false,
        "text": " network"
      },
      {
        "id": 477,
        "logprob": 0.0,
        "special": false,
        "text": " or"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_w8an_fp/test_compressed_tensors_w8an_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.8769531,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.0076942444,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25146484,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.097595215,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.9248047,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027513504,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.043151855,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011043549,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.007698059,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25268555,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.09753418,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.92529297,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027942657,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.042541504,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011053085,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.007698059,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25268555,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.09753418,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.92529297,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027942657,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.042541504,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011053085,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.007698059,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25268555,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.09753418,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.92529297,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027942657,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.042541504,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011053085,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 109,
        "logprob": -0.24707031,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 26843,
        "logprob": -0.14550781,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6044,
        "logprob": -0.038330078,
        "special": false,
        "text": " learning"
      },
      {
        "id": 603,
        "logprob": -0.029907227,
        "special": false,
        "text": " is"
      },
      {
        "id": 476,
        "logprob": -0.020996094,
        "special": false,
        "text": " a"
      },
      {
        "id": 38397,
        "logprob": -0.828125,
        "special": false,
        "text": " subset"
      },
      {
        "id": 576,
        "logprob": -0.00049209595,
        "special": false,
        "text": " of"
      },
      {
        "id": 6479,
        "logprob": -0.057373047,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6044,
        "logprob": -0.000207901,
        "special": false,
        "text": " learning"
      },
      {
        "id": 674,
        "logprob": -0.15429688,
        "special": false,
        "text": " that"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a subset of machine learning that"
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 235336,
        "logprob": 0.0,
        "special": false,
        "text": "?"
      },
      {
        "id": 109,
        "logprob": 0.0,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 26843,
        "logprob": 0.0,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 14715,
        "logprob": -0.38671875,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 603,
        "logprob": 0.0,
        "special": false,
        "text": " is"
      },
      {
        "id": 476,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 38397,
        "logprob": 0.0,
        "special": false,
        "text": " subset"
      },
      {
        "id": 576,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 6479,
        "logprob": 0.0,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6044,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int/test_compressed_tensors_wna16_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.24707031,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.03857422,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.828125,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.00051498413,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05883789,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020694733,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.15820312,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.23828125,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.038330078,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.80859375,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.0005455017,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05908203,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020599365,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.23828125,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.038330078,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.80859375,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.0005455017,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05908203,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020599365,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.23828125,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.038330078,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.80859375,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.0005455017,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05908203,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020599365,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int_24/test_compressed_tensors_wna16_int_24.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 34564,
        "logprob": -1.765625,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6975,
        "logprob": -0.023864746,
        "special": false,
        "text": " learning"
      },
      {
        "id": 374,
        "logprob": -0.1060791,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.1940918,
        "special": false,
        "text": " a"
      },
      {
        "id": 27084,
        "logprob": -0.79785156,
        "special": false,
        "text": " subset"
      },
      {
        "id": 315,
        "logprob": -0.008262634,
        "special": false,
        "text": " of"
      },
      {
        "id": 5780,
        "logprob": -0.046569824,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6975,
        "logprob": -0.0023479462,
        "special": false,
        "text": " learning"
      },
      {
        "id": 430,
        "logprob": -0.7626953,
        "special": false,
        "text": " that"
      },
      {
        "id": 5829,
        "logprob": -1.0107422,
        "special": false,
        "text": " uses"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Deep learning is a subset of machine learning that uses"
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int_24/test_compressed_tensors_wna16_int_24_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 5380,
        "logprob": 0.0,
        "special": false,
        "text": "?\n"
      },
      {
        "id": 34564,
        "logprob": 0.0,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6975,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      },
      {
        "id": 320,
        "logprob": -0.19580078,
        "special": false,
        "text": " ("
      },
      {
        "id": 16931,
        "logprob": -1.7783203,
        "special": false,
        "text": "DL"
      },
      {
        "id": 8,
        "logprob": 0.0,
        "special": false,
        "text": ")"
      },
      {
        "id": 374,
        "logprob": -1.4287109,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 27084,
        "logprob": 0.0,
        "special": false,
        "text": " subset"
      },
      {
        "id": 315,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep learning (DL) is a subset of"
}


================================================
FILE: integration-tests/models/__snapshots__/test_compressed_tensors_wna16_int_24/test_compressed_tensors_wna16_int_24_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 34564,
          "logprob": -1.765625,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6975,
          "logprob": -0.024002075,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.10760498,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.19580078,
          "special": false,
          "text": " a"
        },
        {
          "id": 27084,
          "logprob": -0.7993164,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.008300781,
          "special": false,
          "text": " of"
        },
        {
          "id": 5780,
          "logprob": -0.046295166,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.002374649,
          "special": false,
          "text": " learning"
        },
        {
          "id": 430,
          "logprob": -0.7651367,
          "special": false,
          "text": " that"
        },
        {
          "id": 5829,
          "logprob": -1.0107422,
          "special": false,
          "text": " uses"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Deep learning is a subset of machine learning that uses"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 34564,
          "logprob": -1.7597656,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6975,
          "logprob": -0.024032593,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.10748291,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.19592285,
          "special": false,
          "text": " a"
        },
        {
          "id": 27084,
          "logprob": -0.7988281,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.008354187,
          "special": false,
          "text": " of"
        },
        {
          "id": 5780,
          "logprob": -0.046569824,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.0023517609,
          "special": false,
          "text": " learning"
        },
        {
          "id": 430,
          "logprob": -0.7661133,
          "special": false,
          "text": " that"
        },
        {
          "id": 5829,
          "logprob": -1.0107422,
          "special": false,
          "text": " uses"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Deep learning is a subset of machine learning that uses"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 34564,
          "logprob": -1.7597656,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6975,
          "logprob": -0.024032593,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.10748291,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.19592285,
          "special": false,
          "text": " a"
        },
        {
          "id": 27084,
          "logprob": -0.7988281,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.008354187,
          "special": false,
          "text": " of"
        },
        {
          "id": 5780,
          "logprob": -0.046569824,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.0023517609,
          "special": false,
          "text": " learning"
        },
        {
          "id": 430,
          "logprob": -0.7661133,
          "special": false,
          "text": " that"
        },
        {
          "id": 5829,
          "logprob": -1.0107422,
          "special": false,
          "text": " uses"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Deep learning is a subset of machine learning that uses"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 34564,
          "logprob": -1.7597656,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6975,
          "logprob": -0.024032593,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.10748291,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.19592285,
          "special": false,
          "text": " a"
        },
        {
          "id": 27084,
          "logprob": -0.7988281,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.008354187,
          "special": false,
          "text": " of"
        },
        {
          "id": 5780,
          "logprob": -0.046569824,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.0023517609,
          "special": false,
          "text": " learning"
        },
        {
          "id": 430,
          "logprob": -0.7661133,
          "special": false,
          "text": " that"
        },
        {
          "id": 5829,
          "logprob": -1.0107422,
          "special": false,
          "text": " uses"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Deep learning is a subset of machine learning that uses"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_continue_final_message/test_llama_completion_single_prompt.json
================================================
{
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Both an elephant and a mouse are mammals. However, the differences between elephants and mice are:\n\n1",
        "role": "assistant"
      }
    }
  ],
  "created": 1732541189,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "chat.completion",
  "system_fingerprint": "2.4.1-dev0-native",
  "usage": {
    "completion_tokens": 30,
    "prompt_tokens": 49,
    "total_tokens": 79
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_continue_final_message/test_llama_completion_single_prompt_continue.json
================================================
{
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": " the royal mouse? It is a little more slender and only weighs around 1.5 pounds for males and 1.3 pounds",
        "role": "assistant"
      }
    }
  ],
  "created": 1732541190,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "chat.completion",
  "system_fingerprint": "2.4.1-dev0-native",
  "usage": {
    "completion_tokens": 30,
    "prompt_tokens": 73,
    "total_tokens": 103
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -1.9306641,
        "special": false,
        "text": "\n"
      },
      {
        "id": 5618,
        "logprob": -2.4550781,
        "special": false,
        "text": "What"
      },
      {
        "id": 338,
        "logprob": -0.5732422,
        "special": false,
        "text": " is"
      },
      {
        "id": 278,
        "logprob": -1.5761719,
        "special": false,
        "text": " the"
      },
      {
        "id": 4328,
        "logprob": -1.5888672,
        "special": false,
        "text": " difference"
      },
      {
        "id": 1546,
        "logprob": -0.026504517,
        "special": false,
        "text": " between"
      },
      {
        "id": 21784,
        "logprob": -1.4287109,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 29257,
        "logprob": -0.15856934,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 322,
        "logprob": -0.17456055,
        "special": false,
        "text": " and"
      },
      {
        "id": 6189,
        "logprob": -0.62646484,
        "special": false,
        "text": " Machine"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\nWhat is the difference between Deep Learning and Machine"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 13,
        "logprob": -0.19958496,
        "special": false,
        "text": "\n"
      },
      {
        "id": 4013,
        "logprob": -2.203125,
        "special": false,
        "text": "This"
      },
      {
        "id": 1139,
        "logprob": -0.23693848,
        "special": false,
        "text": " question"
      },
      {
        "id": 756,
        "logprob": 0.0,
        "special": false,
        "text": " has"
      },
      {
        "id": 1063,
        "logprob": -0.076538086,
        "special": false,
        "text": " been"
      },
      {
        "id": 4433,
        "logprob": 0.0,
        "special": false,
        "text": " asked"
      },
      {
        "id": 1784,
        "logprob": -1.1367188,
        "special": false,
        "text": " many"
      },
      {
        "id": 3064,
        "logprob": 0.0,
        "special": false,
        "text": " times"
      },
      {
        "id": 322,
        "logprob": -1.7460938,
        "special": false,
        "text": " and"
      },
      {
        "id": 306,
        "logprob": 0.0,
        "special": false,
        "text": " I"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is Deep Learning?\nThis question has been asked many times and I"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.9306641,
          "special": false,
          "text": "\n"
        },
        {
          "id": 5618,
          "logprob": -2.4550781,
          "special": false,
          "text": "What"
        },
        {
          "id": 338,
          "logprob": -0.5732422,
          "special": false,
          "text": " is"
        },
        {
          "id": 278,
          "logprob": -1.5761719,
          "special": false,
          "text": " the"
        },
        {
          "id": 4328,
          "logprob": -1.5888672,
          "special": false,
          "text": " difference"
        },
        {
          "id": 1546,
          "logprob": -0.026504517,
          "special": false,
          "text": " between"
        },
        {
          "id": 21784,
          "logprob": -1.4287109,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 29257,
          "logprob": -0.15856934,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 322,
          "logprob": -0.17456055,
          "special": false,
          "text": " and"
        },
        {
          "id": 6189,
          "logprob": -0.62646484,
          "special": false,
          "text": " Machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.9306641,
          "special": false,
          "text": "\n"
        },
        {
          "id": 5618,
          "logprob": -2.4550781,
          "special": false,
          "text": "What"
        },
        {
          "id": 338,
          "logprob": -0.5732422,
          "special": false,
          "text": " is"
        },
        {
          "id": 278,
          "logprob": -1.5761719,
          "special": false,
          "text": " the"
        },
        {
          "id": 4328,
          "logprob": -1.5888672,
          "special": false,
          "text": " difference"
        },
        {
          "id": 1546,
          "logprob": -0.026504517,
          "special": false,
          "text": " between"
        },
        {
          "id": 21784,
          "logprob": -1.4287109,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 29257,
          "logprob": -0.15856934,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 322,
          "logprob": -0.17456055,
          "special": false,
          "text": " and"
        },
        {
          "id": 6189,
          "logprob": -0.62646484,
          "special": false,
          "text": " Machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.9306641,
          "special": false,
          "text": "\n"
        },
        {
          "id": 5618,
          "logprob": -2.4550781,
          "special": false,
          "text": "What"
        },
        {
          "id": 338,
          "logprob": -0.5732422,
          "special": false,
          "text": " is"
        },
        {
          "id": 278,
          "logprob": -1.5761719,
          "special": false,
          "text": " the"
        },
        {
          "id": 4328,
          "logprob": -1.5888672,
          "special": false,
          "text": " difference"
        },
        {
          "id": 1546,
          "logprob": -0.026504517,
          "special": false,
          "text": " between"
        },
        {
          "id": 21784,
          "logprob": -1.4287109,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 29257,
          "logprob": -0.15856934,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 322,
          "logprob": -0.17456055,
          "special": false,
          "text": " and"
        },
        {
          "id": 6189,
          "logprob": -0.62646484,
          "special": false,
          "text": " Machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.9306641,
          "special": false,
          "text": "\n"
        },
        {
          "id": 5618,
          "logprob": -2.4550781,
          "special": false,
          "text": "What"
        },
        {
          "id": 338,
          "logprob": -0.5732422,
          "special": false,
          "text": " is"
        },
        {
          "id": 278,
          "logprob": -1.5761719,
          "special": false,
          "text": " the"
        },
        {
          "id": 4328,
          "logprob": -1.5888672,
          "special": false,
          "text": " difference"
        },
        {
          "id": 1546,
          "logprob": -0.026504517,
          "special": false,
          "text": " between"
        },
        {
          "id": 21784,
          "logprob": -1.4287109,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 29257,
          "logprob": -0.15856934,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 322,
          "logprob": -0.17456055,
          "special": false,
          "text": " and"
        },
        {
          "id": 6189,
          "logprob": -0.62646484,
          "special": false,
          "text": " Machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.9228516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 5618,
          "logprob": -2.4609375,
          "special": false,
          "text": "What"
        },
        {
          "id": 338,
          "logprob": -0.57177734,
          "special": false,
          "text": " is"
        },
        {
          "id": 278,
          "logprob": -1.5722656,
          "special": false,
          "text": " the"
        },
        {
          "id": 4328,
          "logprob": -1.5859375,
          "special": false,
          "text": " difference"
        },
        {
          "id": 1546,
          "logprob": -0.02633667,
          "special": false,
          "text": " between"
        },
        {
          "id": 21784,
          "logprob": -1.4335938,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 29257,
          "logprob": -0.15991211,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 322,
          "logprob": -0.17456055,
          "special": false,
          "text": " and"
        },
        {
          "id": 6189,
          "logprob": -0.62060547,
          "special": false,
          "text": " Machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.9228516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 5618,
          "logprob": -2.4609375,
          "special": false,
          "text": "What"
        },
        {
          "id": 338,
          "logprob": -0.57177734,
          "special": false,
          "text": " is"
        },
        {
          "id": 278,
          "logprob": -1.5722656,
          "special": false,
          "text": " the"
        },
        {
          "id": 4328,
          "logprob": -1.5859375,
          "special": false,
          "text": " difference"
        },
        {
          "id": 1546,
          "logprob": -0.02633667,
          "special": false,
          "text": " between"
        },
        {
          "id": 21784,
          "logprob": -1.4335938,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 29257,
          "logprob": -0.15991211,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 322,
          "logprob": -0.17456055,
          "special": false,
          "text": " and"
        },
        {
          "id": 6189,
          "logprob": -0.62060547,
          "special": false,
          "text": " Machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.9228516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 5618,
          "logprob": -2.4609375,
          "special": false,
          "text": "What"
        },
        {
          "id": 338,
          "logprob": -0.57177734,
          "special": false,
          "text": " is"
        },
        {
          "id": 278,
          "logprob": -1.5722656,
          "special": false,
          "text": " the"
        },
        {
          "id": 4328,
          "logprob": -1.5859375,
          "special": false,
          "text": " difference"
        },
        {
          "id": 1546,
          "logprob": -0.02633667,
          "special": false,
          "text": " between"
        },
        {
          "id": 21784,
          "logprob": -1.4335938,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 29257,
          "logprob": -0.15991211,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 322,
          "logprob": -0.17456055,
          "special": false,
          "text": " and"
        },
        {
          "id": 6189,
          "logprob": -0.62060547,
          "special": false,
          "text": " Machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.9228516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 5618,
          "logprob": -2.4609375,
          "special": false,
          "text": "What"
        },
        {
          "id": 338,
          "logprob": -0.57177734,
          "special": false,
          "text": " is"
        },
        {
          "id": 278,
          "logprob": -1.5722656,
          "special": false,
          "text": " the"
        },
        {
          "id": 4328,
          "logprob": -1.5859375,
          "special": false,
          "text": " difference"
        },
        {
          "id": 1546,
          "logprob": -0.02633667,
          "special": false,
          "text": " between"
        },
        {
          "id": 21784,
          "logprob": -1.4335938,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 29257,
          "logprob": -0.15991211,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 322,
          "logprob": -0.17456055,
          "special": false,
          "text": " and"
        },
        {
          "id": 6189,
          "logprob": -0.62060547,
          "special": false,
          "text": " Machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -1.9228516,
        "special": false,
        "text": "\n"
      },
      {
        "id": 5618,
        "logprob": -2.4609375,
        "special": false,
        "text": "What"
      },
      {
        "id": 338,
        "logprob": -0.57177734,
        "special": false,
        "text": " is"
      },
      {
        "id": 278,
        "logprob": -1.5722656,
        "special": false,
        "text": " the"
      },
      {
        "id": 4328,
        "logprob": -1.5927734,
        "special": false,
        "text": " difference"
      },
      {
        "id": 1546,
        "logprob": -0.026428223,
        "special": false,
        "text": " between"
      },
      {
        "id": 21784,
        "logprob": -1.4267578,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 29257,
        "logprob": -0.16015625,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 322,
        "logprob": -0.17382812,
        "special": false,
        "text": " and"
      },
      {
        "id": 6189,
        "logprob": -0.62060547,
        "special": false,
        "text": " Machine"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\nWhat is the difference between Deep Learning and Machine"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_deepseek_v2/test_flash_deepseek_v2.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 185,
        "logprob": -1.546875,
        "special": false,
        "text": "\n"
      },
      {
        "id": 549,
        "logprob": -2.859375,
        "special": false,
        "text": "The"
      },
      {
        "id": 1727,
        "logprob": -2.484375,
        "special": false,
        "text": " test"
      },
      {
        "id": 3102,
        "logprob": -0.83203125,
        "special": false,
        "text": " request"
      },
      {
        "id": 317,
        "logprob": -1.1484375,
        "special": false,
        "text": " is"
      },
      {
        "id": 245,
        "logprob": -1.578125,
        "special": false,
        "text": " a"
      },
      {
        "id": 3412,
        "logprob": -2.578125,
        "special": false,
        "text": " document"
      },
      {
        "id": 344,
        "logprob": -1.125,
        "special": false,
        "text": " that"
      },
      {
        "id": 317,
        "logprob": -1.6953125,
        "special": false,
        "text": " is"
      },
      {
        "id": 1222,
        "logprob": -1.71875,
        "special": false,
        "text": " used"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\nThe test request is a document that is used"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_deepseek_v2/test_flash_deepseek_v2_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 4,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 2143,
        "logprob": -1.828125,
        "special": false,
        "text": " sent"
      },
      {
        "id": 10081,
        "logprob": -0.41210938,
        "special": false,
        "text": " successfully"
      },
      {
        "id": 13,
        "logprob": 0.0,
        "special": false,
        "text": "."
      },
      {
        "id": 100001,
        "logprob": -0.16015625,
        "special": true,
        "text": "<｜end▁of▁sentence｜>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request sent successfully."
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_deepseek_v2/test_flash_deepseek_v2_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 185,
          "logprob": -1.546875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 549,
          "logprob": -2.859375,
          "special": false,
          "text": "The"
        },
        {
          "id": 1727,
          "logprob": -2.4375,
          "special": false,
          "text": " test"
        },
        {
          "id": 3102,
          "logprob": -0.83984375,
          "special": false,
          "text": " request"
        },
        {
          "id": 317,
          "logprob": -1.1328125,
          "special": false,
          "text": " is"
        },
        {
          "id": 254,
          "logprob": -1.515625,
          "special": false,
          "text": " the"
        },
        {
          "id": 1022,
          "logprob": -1.15625,
          "special": false,
          "text": " first"
        },
        {
          "id": 3458,
          "logprob": -0.3671875,
          "special": false,
          "text": " step"
        },
        {
          "id": 279,
          "logprob": -0.88671875,
          "special": false,
          "text": " in"
        },
        {
          "id": 254,
          "logprob": -0.69140625,
          "special": false,
          "text": " the"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nThe test request is the first step in the"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 185,
          "logprob": -1.546875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 549,
          "logprob": -2.859375,
          "special": false,
          "text": "The"
        },
        {
          "id": 1727,
          "logprob": -2.4375,
          "special": false,
          "text": " test"
        },
        {
          "id": 3102,
          "logprob": -0.83984375,
          "special": false,
          "text": " request"
        },
        {
          "id": 317,
          "logprob": -1.1328125,
          "special": false,
          "text": " is"
        },
        {
          "id": 254,
          "logprob": -1.515625,
          "special": false,
          "text": " the"
        },
        {
          "id": 1022,
          "logprob": -1.15625,
          "special": false,
          "text": " first"
        },
        {
          "id": 3458,
          "logprob": -0.3671875,
          "special": false,
          "text": " step"
        },
        {
          "id": 279,
          "logprob": -0.88671875,
          "special": false,
          "text": " in"
        },
        {
          "id": 254,
          "logprob": -0.69140625,
          "special": false,
          "text": " the"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nThe test request is the first step in the"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 185,
          "logprob": -1.546875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 549,
          "logprob": -2.859375,
          "special": false,
          "text": "The"
        },
        {
          "id": 1727,
          "logprob": -2.4375,
          "special": false,
          "text": " test"
        },
        {
          "id": 3102,
          "logprob": -0.83984375,
          "special": false,
          "text": " request"
        },
        {
          "id": 317,
          "logprob": -1.1328125,
          "special": false,
          "text": " is"
        },
        {
          "id": 254,
          "logprob": -1.515625,
          "special": false,
          "text": " the"
        },
        {
          "id": 1022,
          "logprob": -1.15625,
          "special": false,
          "text": " first"
        },
        {
          "id": 3458,
          "logprob": -0.3671875,
          "special": false,
          "text": " step"
        },
        {
          "id": 279,
          "logprob": -0.88671875,
          "special": false,
          "text": " in"
        },
        {
          "id": 254,
          "logprob": -0.69140625,
          "special": false,
          "text": " the"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nThe test request is the first step in the"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 185,
          "logprob": -1.546875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 549,
          "logprob": -2.859375,
          "special": false,
          "text": "The"
        },
        {
          "id": 1727,
          "logprob": -2.4375,
          "special": false,
          "text": " test"
        },
        {
          "id": 3102,
          "logprob": -0.83984375,
          "special": false,
          "text": " request"
        },
        {
          "id": 317,
          "logprob": -1.1328125,
          "special": false,
          "text": " is"
        },
        {
          "id": 254,
          "logprob": -1.515625,
          "special": false,
          "text": " the"
        },
        {
          "id": 1022,
          "logprob": -1.15625,
          "special": false,
          "text": " first"
        },
        {
          "id": 3458,
          "logprob": -0.3671875,
          "special": false,
          "text": " step"
        },
        {
          "id": 279,
          "logprob": -0.88671875,
          "special": false,
          "text": " in"
        },
        {
          "id": 254,
          "logprob": -0.69140625,
          "special": false,
          "text": " the"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nThe test request is the first step in the"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_falcon/test_flash_falcon.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 23090,
        "logprob": -1.8251953,
        "special": false,
        "text": " Hello"
      },
      {
        "id": 23,
        "logprob": -0.3173828,
        "special": false,
        "text": ","
      },
      {
        "id": 8156,
        "logprob": -0.23803711,
        "special": false,
        "text": " Daniel"
      },
      {
        "id": 12,
        "logprob": -0.56933594,
        "special": false,
        "text": "!"
      },
      {
        "id": 193,
        "logprob": -0.61279297,
        "special": false,
        "text": "\n"
      },
      {
        "id": 23626,
        "logprob": -0.41967773,
        "special": false,
        "text": "Daniel"
      },
      {
        "id": 37,
        "logprob": -0.0023403168,
        "special": false,
        "text": ":"
      },
      {
        "id": 1634,
        "logprob": -2.0605469,
        "special": false,
        "text": " What"
      },
      {
        "id": 18,
        "logprob": -1.5292969,
        "special": false,
        "text": "'"
      },
      {
        "id": 94,
        "logprob": -0.007904053,
        "special": false,
        "text": "s"
      }
    ]
  },
  "generated_text": " Hello, Daniel!\nDaniel: What's"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_falcon/test_flash_falcon_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 836,
        "logprob": -1.265625,
        "special": false,
        "text": " i"
      },
      {
        "id": 18,
        "logprob": -0.119628906,
        "special": false,
        "text": "'"
      },
      {
        "id": 298,
        "logprob": -2.265625,
        "special": false,
        "text": "ve"
      },
      {
        "id": 650,
        "logprob": -0.49804688,
        "special": false,
        "text": " been"
      },
      {
        "id": 1241,
        "logprob": 0.0,
        "special": false,
        "text": " using"
      },
      {
        "id": 334,
        "logprob": 0.0,
        "special": false,
        "text": " it"
      },
      {
        "id": 312,
        "logprob": -1.2421875,
        "special": false,
        "text": " for"
      },
      {
        "id": 909,
        "logprob": -0.99609375,
        "special": false,
        "text": " years"
      },
      {
        "id": 193,
        "logprob": -0.30273438,
        "special": false,
        "text": "\n"
      },
      {
        "id": 807,
        "logprob": -1.078125,
        "special": false,
        "text": "ik"
      }
    ]
  },
  "generated_text": "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron: i've been using it for years\nik"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_falcon/test_flash_falcon_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 23090,
          "logprob": -1.828125,
          "special": false,
          "text": " Hello"
        },
        {
          "id": 23,
          "logprob": -0.3178711,
          "special": false,
          "text": ","
        },
        {
          "id": 8156,
          "logprob": -0.23925781,
          "special": false,
          "text": " Daniel"
        },
        {
          "id": 12,
          "logprob": -0.5698242,
          "special": false,
          "text": "!"
        },
        {
          "id": 193,
          "logprob": -0.61279297,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23626,
          "logprob": -0.4177246,
          "special": false,
          "text": "Daniel"
        },
        {
          "id": 37,
          "logprob": -0.0023345947,
          "special": false,
          "text": ":"
        },
        {
          "id": 1634,
          "logprob": -2.0605469,
          "special": false,
          "text": " What"
        },
        {
          "id": 18,
          "logprob": -1.5283203,
          "special": false,
          "text": "'"
        },
        {
          "id": 94,
          "logprob": -0.007965088,
          "special": false,
          "text": "s"
        }
      ]
    },
    "generated_text": " Hello, Daniel!\nDaniel: What's"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 23090,
          "logprob": -1.8251953,
          "special": false,
          "text": " Hello"
        },
        {
          "id": 23,
          "logprob": -0.31762695,
          "special": false,
          "text": ","
        },
        {
          "id": 8156,
          "logprob": -0.2388916,
          "special": false,
          "text": " Daniel"
        },
        {
          "id": 12,
          "logprob": -0.5698242,
          "special": false,
          "text": "!"
        },
        {
          "id": 193,
          "logprob": -0.6152344,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23626,
          "logprob": -0.42211914,
          "special": false,
          "text": "Daniel"
        },
        {
          "id": 37,
          "logprob": -0.002336502,
          "special": false,
          "text": ":"
        },
        {
          "id": 1634,
          "logprob": -2.0605469,
          "special": false,
          "text": " What"
        },
        {
          "id": 18,
          "logprob": -1.5292969,
          "special": false,
          "text": "'"
        },
        {
          "id": 94,
          "logprob": -0.007926941,
          "special": false,
          "text": "s"
        }
      ]
    },
    "generated_text": " Hello, Daniel!\nDaniel: What's"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 23090,
          "logprob": -1.8251953,
          "special": false,
          "text": " Hello"
        },
        {
          "id": 23,
          "logprob": -0.31762695,
          "special": false,
          "text": ","
        },
        {
          "id": 8156,
          "logprob": -0.2388916,
          "special": false,
          "text": " Daniel"
        },
        {
          "id": 12,
          "logprob": -0.5698242,
          "special": false,
          "text": "!"
        },
        {
          "id": 193,
          "logprob": -0.6152344,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23626,
          "logprob": -0.42211914,
          "special": false,
          "text": "Daniel"
        },
        {
          "id": 37,
          "logprob": -0.002336502,
          "special": false,
          "text": ":"
        },
        {
          "id": 1634,
          "logprob": -2.0605469,
          "special": false,
          "text": " What"
        },
        {
          "id": 18,
          "logprob": -1.5292969,
          "special": false,
          "text": "'"
        },
        {
          "id": 94,
          "logprob": -0.007926941,
          "special": false,
          "text": "s"
        }
      ]
    },
    "generated_text": " Hello, Daniel!\nDaniel: What's"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 23090,
          "logprob": -1.8251953,
          "special": false,
          "text": " Hello"
        },
        {
          "id": 23,
          "logprob": -0.31762695,
          "special": false,
          "text": ","
        },
        {
          "id": 8156,
          "logprob": -0.2388916,
          "special": false,
          "text": " Daniel"
        },
        {
          "id": 12,
          "logprob": -0.5698242,
          "special": false,
          "text": "!"
        },
        {
          "id": 193,
          "logprob": -0.6152344,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23626,
          "logprob": -0.42211914,
          "special": false,
          "text": "Daniel"
        },
        {
          "id": 37,
          "logprob": -0.002336502,
          "special": false,
          "text": ":"
        },
        {
          "id": 1634,
          "logprob": -2.0605469,
          "special": false,
          "text": " What"
        },
        {
          "id": 18,
          "logprob": -1.5292969,
          "special": false,
          "text": "'"
        },
        {
          "id": 94,
          "logprob": -0.007926941,
          "special": false,
          "text": "s"
        }
      ]
    },
    "generated_text": " Hello, Daniel!\nDaniel: What's"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 7539,
        "logprob": -0.609375,
        "special": false,
        "text": " forms"
      },
      {
        "id": 708,
        "logprob": 0.0,
        "special": false,
        "text": " are"
      },
      {
        "id": 671,
        "logprob": -1.5546875,
        "special": false,
        "text": " an"
      },
      {
        "id": 8727,
        "logprob": 0.0,
        "special": false,
        "text": " essential"
      },
      {
        "id": 1702,
        "logprob": 0.0,
        "special": false,
        "text": " part"
      },
      {
        "id": 576,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 573,
        "logprob": 0.0,
        "special": false,
        "text": " the"
      },
      {
        "id": 11859,
        "logprob": -1.953125,
        "special": false,
        "text": " lab"
      },
      {
        "id": 2185,
        "logprob": -1.7734375,
        "special": false,
        "text": " process"
      },
      {
        "id": 235265,
        "logprob": 0.0,
        "special": false,
        "text": "."
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request forms are an essential part of the lab process."
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 1736,
          "logprob": -2.09375,
          "special": false,
          "text": " form"
        },
        {
          "id": 109,
          "logprob": -1.9140625,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 651,
          "logprob": -2.453125,
          "special": false,
          "text": "The"
        },
        {
          "id": 2121,
          "logprob": -1.8984375,
          "special": false,
          "text": " test"
        },
        {
          "id": 3853,
          "logprob": -0.23535156,
          "special": false,
          "text": " request"
        },
        {
          "id": 1736,
          "logprob": -0.091308594,
          "special": false,
          "text": " form"
        },
        {
          "id": 603,
          "logprob": -0.96875,
          "special": false,
          "text": " is"
        },
        {
          "id": 1671,
          "logprob": -1.6484375,
          "special": false,
          "text": " used"
        },
        {
          "id": 577,
          "logprob": -0.43164062,
          "special": false,
          "text": " to"
        },
        {
          "id": 3853,
          "logprob": -1.2421875,
          "special": false,
          "text": " request"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " form\n\nThe test request form is used to request"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 1736,
          "logprob": -2.09375,
          "special": false,
          "text": " form"
        },
        {
          "id": 109,
          "logprob": -1.9140625,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 651,
          "logprob": -2.453125,
          "special": false,
          "text": "The"
        },
        {
          "id": 2121,
          "logprob": -1.8984375,
          "special": false,
          "text": " test"
        },
        {
          "id": 3853,
          "logprob": -0.23535156,
          "special": false,
          "text": " request"
        },
        {
          "id": 1736,
          "logprob": -0.091308594,
          "special": false,
          "text": " form"
        },
        {
          "id": 603,
          "logprob": -0.96875,
          "special": false,
          "text": " is"
        },
        {
          "id": 1671,
          "logprob": -1.6484375,
          "special": false,
          "text": " used"
        },
        {
          "id": 577,
          "logprob": -0.43164062,
          "special": false,
          "text": " to"
        },
        {
          "id": 3853,
          "logprob": -1.2421875,
          "special": false,
          "text": " request"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " form\n\nThe test request form is used to request"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 1736,
          "logprob": -2.09375,
          "special": false,
          "text": " form"
        },
        {
          "id": 109,
          "logprob": -1.9140625,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 651,
          "logprob": -2.453125,
          "special": false,
          "text": "The"
        },
        {
          "id": 2121,
          "logprob": -1.8984375,
          "special": false,
          "text": " test"
        },
        {
          "id": 3853,
          "logprob": -0.23535156,
          "special": false,
          "text": " request"
        },
        {
          "id": 1736,
          "logprob": -0.091308594,
          "special": false,
          "text": " form"
        },
        {
          "id": 603,
          "logprob": -0.96875,
          "special": false,
          "text": " is"
        },
        {
          "id": 1671,
          "logprob": -1.6484375,
          "special": false,
          "text": " used"
        },
        {
          "id": 577,
          "logprob": -0.43164062,
          "special": false,
          "text": " to"
        },
        {
          "id": 3853,
          "logprob": -1.2421875,
          "special": false,
          "text": " request"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " form\n\nThe test request form is used to request"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 1736,
          "logprob": -2.09375,
          "special": false,
          "text": " form"
        },
        {
          "id": 109,
          "logprob": -1.9140625,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 651,
          "logprob": -2.453125,
          "special": false,
          "text": "The"
        },
        {
          "id": 2121,
          "logprob": -1.8984375,
          "special": false,
          "text": " test"
        },
        {
          "id": 3853,
          "logprob": -0.23535156,
          "special": false,
          "text": " request"
        },
        {
          "id": 1736,
          "logprob": -0.091308594,
          "special": false,
          "text": " form"
        },
        {
          "id": 603,
          "logprob": -0.96875,
          "special": false,
          "text": " is"
        },
        {
          "id": 1671,
          "logprob": -1.6484375,
          "special": false,
          "text": " used"
        },
        {
          "id": 577,
          "logprob": -0.43164062,
          "special": false,
          "text": " to"
        },
        {
          "id": 3853,
          "logprob": -1.2421875,
          "special": false,
          "text": " request"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " form\n\nThe test request form is used to request"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma_simple.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 1736,
        "logprob": -2.109375,
        "special": false,
        "text": " form"
      },
      {
        "id": 109,
        "logprob": -1.90625,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 651,
        "logprob": -2.4375,
        "special": false,
        "text": "The"
      },
      {
        "id": 2121,
        "logprob": -1.796875,
        "special": false,
        "text": " test"
      },
      {
        "id": 3853,
        "logprob": -0.24511719,
        "special": false,
        "text": " request"
      },
      {
        "id": 1736,
        "logprob": -0.09326172,
        "special": false,
        "text": " form"
      },
      {
        "id": 603,
        "logprob": -0.95703125,
        "special": false,
        "text": " is"
      },
      {
        "id": 1671,
        "logprob": -1.5859375,
        "special": false,
        "text": " used"
      },
      {
        "id": 577,
        "logprob": -0.39257812,
        "special": false,
        "text": " to"
      },
      {
        "id": 3853,
        "logprob": -1.25,
        "special": false,
        "text": " request"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " form\n\nThe test request form is used to request"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma2/test_flash_gemma2.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 688,
        "logprob": -0.546875,
        "special": false,
        "text": "**"
      },
      {
        "id": 103889,
        "logprob": -0.49023438,
        "special": false,
        "text": "Hydrogen"
      },
      {
        "id": 190213,
        "logprob": -0.48632812,
        "special": false,
        "text": "**,"
      },
      {
        "id": 2611,
        "logprob": -0.58203125,
        "special": false,
        "text": " light"
      },
      {
        "id": 578,
        "logprob": -0.099121094,
        "special": false,
        "text": " and"
      },
      {
        "id": 2223,
        "logprob": -1.078125,
        "special": false,
        "text": " free"
      },
      {
        "id": 235269,
        "logprob": -0.025756836,
        "special": false,
        "text": ","
      },
      {
        "id": 108,
        "logprob": -0.29101562,
        "special": false,
        "text": "\n"
      },
      {
        "id": 688,
        "logprob": -0.0035858154,
        "special": false,
        "text": "**"
      },
      {
        "id": 1949,
        "logprob": -4.1007996e-05,
        "special": false,
        "text": "He"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "**Hydrogen**, light and free,\n**He"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma2/test_flash_gemma2_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 688,
          "logprob": -0.546875,
          "special": false,
          "text": "**"
        },
        {
          "id": 103889,
          "logprob": -0.49023438,
          "special": false,
          "text": "Hydrogen"
        },
        {
          "id": 190213,
          "logprob": -0.48632812,
          "special": false,
          "text": "**,"
        },
        {
          "id": 2611,
          "logprob": -0.58203125,
          "special": false,
          "text": " light"
        },
        {
          "id": 578,
          "logprob": -0.08886719,
          "special": false,
          "text": " and"
        },
        {
          "id": 2223,
          "logprob": -1.09375,
          "special": false,
          "text": " free"
        },
        {
          "id": 235269,
          "logprob": -0.024291992,
          "special": false,
          "text": ","
        },
        {
          "id": 108,
          "logprob": -0.30664062,
          "special": false,
          "text": "\n"
        },
        {
          "id": 688,
          "logprob": -0.0035552979,
          "special": false,
          "text": "**"
        },
        {
          "id": 1949,
          "logprob": -4.220009e-05,
          "special": false,
          "text": "He"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "**Hydrogen**, light and free,\n**He"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 688,
          "logprob": -0.546875,
          "special": false,
          "text": "**"
        },
        {
          "id": 103889,
          "logprob": -0.49023438,
          "special": false,
          "text": "Hydrogen"
        },
        {
          "id": 190213,
          "logprob": -0.48632812,
          "special": false,
          "text": "**,"
        },
        {
          "id": 2611,
          "logprob": -0.58203125,
          "special": false,
          "text": " light"
        },
        {
          "id": 578,
          "logprob": -0.08886719,
          "special": false,
          "text": " and"
        },
        {
          "id": 2223,
          "logprob": -1.09375,
          "special": false,
          "text": " free"
        },
        {
          "id": 235269,
          "logprob": -0.024291992,
          "special": false,
          "text": ","
        },
        {
          "id": 108,
          "logprob": -0.30664062,
          "special": false,
          "text": "\n"
        },
        {
          "id": 688,
          "logprob": -0.0035552979,
          "special": false,
          "text": "**"
        },
        {
          "id": 1949,
          "logprob": -4.220009e-05,
          "special": false,
          "text": "He"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "**Hydrogen**, light and free,\n**He"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 688,
          "logprob": -0.546875,
          "special": false,
          "text": "**"
        },
        {
          "id": 103889,
          "logprob": -0.49023438,
          "special": false,
          "text": "Hydrogen"
        },
        {
          "id": 190213,
          "logprob": -0.48632812,
          "special": false,
          "text": "**,"
        },
        {
          "id": 2611,
          "logprob": -0.58203125,
          "special": false,
          "text": " light"
        },
        {
          "id": 578,
          "logprob": -0.08984375,
          "special": false,
          "text": " and"
        },
        {
          "id": 2223,
          "logprob": -1.1015625,
          "special": false,
          "text": " free"
        },
        {
          "id": 235269,
          "logprob": -0.024291992,
          "special": false,
          "text": ","
        },
        {
          "id": 108,
          "logprob": -0.30664062,
          "special": false,
          "text": "\n"
        },
        {
          "id": 688,
          "logprob": -0.0038452148,
          "special": false,
          "text": "**"
        },
        {
          "id": 1949,
          "logprob": -4.1484833e-05,
          "special": false,
          "text": "He"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "**Hydrogen**, light and free,\n**He"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 688,
          "logprob": -0.546875,
          "special": false,
          "text": "**"
        },
        {
          "id": 103889,
          "logprob": -0.49023438,
          "special": false,
          "text": "Hydrogen"
        },
        {
          "id": 190213,
          "logprob": -0.48632812,
          "special": false,
          "text": "**,"
        },
        {
          "id": 2611,
          "logprob": -0.58203125,
          "special": false,
          "text": " light"
        },
        {
          "id": 578,
          "logprob": -0.08886719,
          "special": false,
          "text": " and"
        },
        {
          "id": 2223,
          "logprob": -1.09375,
          "special": false,
          "text": " free"
        },
        {
          "id": 235269,
          "logprob": -0.024291992,
          "special": false,
          "text": ","
        },
        {
          "id": 108,
          "logprob": -0.30664062,
          "special": false,
          "text": "\n"
        },
        {
          "id": 688,
          "logprob": -0.0035552979,
          "special": false,
          "text": "**"
        },
        {
          "id": 1949,
          "logprob": -4.220009e-05,
          "special": false,
          "text": "He"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "**Hydrogen**, light and free,\n**He"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 16,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 506,
        "logprob": -1.3984375,
        "special": false,
        "text": " the"
      },
      {
        "id": 1331,
        "logprob": -1.6953125,
        "special": false,
        "text": " people"
      },
      {
        "id": 236764,
        "logprob": -0.23535156,
        "special": false,
        "text": ","
      },
      {
        "id": 532,
        "logprob": -0.24316406,
        "special": false,
        "text": " and"
      },
      {
        "id": 506,
        "logprob": -0.12109375,
        "special": false,
        "text": " the"
      },
      {
        "id": 2780,
        "logprob": -1.1640625,
        "special": false,
        "text": " food"
      },
      {
        "id": 236761,
        "logprob": -0.21386719,
        "special": false,
        "text": "."
      },
      {
        "id": 108,
        "logprob": -0.64453125,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 2094,
        "logprob": -0.77734375,
        "special": false,
        "text": "This"
      },
      {
        "id": 563,
        "logprob": -0.040283203,
        "special": false,
        "text": " is"
      },
      {
        "id": 496,
        "logprob": -0.03125,
        "special": false,
        "text": " a"
      },
      {
        "id": 6290,
        "logprob": -0.03515625,
        "special": false,
        "text": " nice"
      },
      {
        "id": 1977,
        "logprob": -0.0020751953,
        "special": false,
        "text": " place"
      },
      {
        "id": 236761,
        "logprob": -0.0079956055,
        "special": false,
        "text": "."
      },
      {
        "id": 107,
        "logprob": -0.9921875,
        "special": false,
        "text": "\n"
      },
      {
        "id": 106,
        "logprob": -0.45507812,
        "special": true,
        "text": "<end_of_turn>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " the people, and the food.\n\nThis is a nice place.\n"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 100,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 1331,
        "logprob": -0.31835938,
        "special": false,
        "text": " people"
      },
      {
        "id": 8390,
        "logprob": -0.1484375,
        "special": false,
        "text": " died"
      },
      {
        "id": 528,
        "logprob": -1.1171875,
        "special": false,
        "text": " in"
      },
      {
        "id": 506,
        "logprob": -0.45898438,
        "special": false,
        "text": " the"
      },
      {
        "id": 3640,
        "logprob": -0.55859375,
        "special": false,
        "text": " United"
      },
      {
        "id": 4184,
        "logprob": -0.0026397705,
        "special": false,
        "text": " States"
      },
      {
        "id": 236761,
        "logprob": -0.38085938,
        "special": false,
        "text": "."
      },
      {
        "id": 108,
        "logprob": -0.07421875,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 818,
        "logprob": -1.0859375,
        "special": false,
        "text": "The"
      },
      {
        "id": 6816,
        "logprob": -1.75,
        "special": false,
        "text": " generally"
      },
      {
        "id": 10951,
        "logprob": -0.14648438,
        "special": false,
        "text": " accepted"
      },
      {
        "id": 10967,
        "logprob": -0.9609375,
        "special": false,
        "text": " estimate"
      },
      {
        "id": 563,
        "logprob": -0.49414062,
        "special": false,
        "text": " is"
      },
      {
        "id": 600,
        "logprob": -0.703125,
        "special": false,
        "text": " that"
      },
      {
        "id": 236743,
        "logprob": -1.171875,
        "special": false,
        "text": " "
      },
      {
        "id": 236825,
        "logprob": -0.0009918213,
        "special": false,
        "text": "6"
      },
      {
        "id": 236832,
        "logprob": -6.389618e-05,
        "special": false,
        "text": "7"
      },
      {
        "id": 236810,
        "logprob": -4.7445297e-05,
        "special": false,
        "text": "5"
      },
      {
        "id": 236764,
        "logprob": -0.00017929077,
        "special": false,
        "text": ","
      },
      {
        "id": 236771,
        "logprob": -1.4901161e-05,
        "special": false,
        "text": "0"
      },
      {
        "id": 236771,
        "logprob": -1.7881393e-06,
        "special": false,
        "text": "0"
      },
      {
        "id": 236771,
        "logprob": 0.0,
        "special": false,
        "text": "0"
      },
      {
        "id": 1331,
        "logprob": -0.45898438,
        "special": false,
        "text": " people"
      },
      {
        "id": 8390,
        "logprob": -0.011474609,
        "special": false,
        "text": " died"
      },
      {
        "id": 528,
        "logprob": -0.084472656,
        "special": false,
        "text": " in"
      },
      {
        "id": 506,
        "logprob": -0.00032615662,
        "special": false,
        "text": " the"
      },
      {
        "id": 3640,
        "logprob": -0.029785156,
        "special": false,
        "text": " United"
      },
      {
        "id": 4184,
        "logprob": -0.00012302399,
        "special": false,
        "text": " States"
      },
      {
        "id": 236761,
        "logprob": -1.1796875,
        "special": false,
        "text": "."
      },
      {
        "id": 3153,
        "logprob": -0.09667969,
        "special": false,
        "text": " However"
      },
      {
        "id": 236764,
        "logprob": -0.009094238,
        "special": false,
        "text": ","
      },
      {
        "id": 1070,
        "logprob": -0.91015625,
        "special": false,
        "text": " some"
      },
      {
        "id": 61806,
        "logprob": -0.859375,
        "special": false,
        "text": " historians"
      },
      {
        "id": 4646,
        "logprob": -1.3828125,
        "special": false,
        "text": " believe"
      },
      {
        "id": 506,
        "logprob": -0.65234375,
        "special": false,
        "text": " the"
      },
      {
        "id": 5396,
        "logprob": -0.765625,
        "special": false,
        "text": " actual"
      },
      {
        "id": 1548,
        "logprob": -0.048339844,
        "special": false,
        "text": " number"
      },
      {
        "id": 1451,
        "logprob": -0.65625,
        "special": false,
        "text": " could"
      },
      {
        "id": 577,
        "logprob": -0.09082031,
        "special": false,
        "text": " be"
      },
      {
        "id": 618,
        "logprob": -0.625,
        "special": false,
        "text": " as"
      },
      {
        "id": 1494,
        "logprob": -0.00037193298,
        "special": false,
        "text": " high"
      },
      {
        "id": 618,
        "logprob": -0.0001296997,
        "special": false,
        "text": " as"
      },
      {
        "id": 236743,
        "logprob": -0.00093460083,
        "special": false,
        "text": " "
      },
      {
        "id": 236770,
        "logprob": -0.21289062,
        "special": false,
        "text": "1"
      },
      {
        "id": 236771,
        "logprob": -0.16796875,
        "special": false,
        "text": "0"
      },
      {
        "id": 3625,
        "logprob": -0.0126953125,
        "special": false,
        "text": " million"
      },
      {
        "id": 236761,
        "logprob": -0.22460938,
        "special": false,
        "text": "."
      },
      {
        "id": 108,
        "logprob": -0.3984375,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 236777,
        "logprob": -1.078125,
        "special": false,
        "text": "I"
      },
      {
        "id": 1006,
        "logprob": -1.359375,
        "special": false,
        "text": " am"
      },
      {
        "id": 3182,
        "logprob": -1.0859375,
        "special": false,
        "text": " looking"
      },
      {
        "id": 573,
        "logprob": -0.035888672,
        "special": false,
        "text": " for"
      },
      {
        "id": 919,
        "logprob": -1.2578125,
        "special": false,
        "text": " more"
      },
      {
        "id": 1938,
        "logprob": -1.3046875,
        "special": false,
        "text": " information"
      },
      {
        "id": 580,
        "logprob": -0.7421875,
        "special": false,
        "text": " on"
      },
      {
        "id": 672,
        "logprob": -0.78125,
        "special": false,
        "text": " this"
      },
      {
        "id": 59725,
        "logprob": -0.7109375,
        "special": false,
        "text": " discrepancy"
      },
      {
        "id": 532,
        "logprob": -0.8046875,
        "special": false,
        "text": " and"
      },
      {
        "id": 506,
        "logprob": -0.71484375,
        "special": false,
        "text": " the"
      },
      {
        "id": 5872,
        "logprob": -1.1640625,
        "special": false,
        "text": " factors"
      },
      {
        "id": 600,
        "logprob": -0.20410156,
        "special": false,
        "text": " that"
      },
      {
        "id": 19263,
        "logprob": -1.1484375,
        "special": false,
        "text": " contributed"
      },
      {
        "id": 531,
        "logprob": -0.000957489,
        "special": false,
        "text": " to"
      },
      {
        "id": 506,
        "logprob": -0.19921875,
        "special": false,
        "text": " the"
      },
      {
        "id": 5777,
        "logprob": -1.171875,
        "special": false,
        "text": " wide"
      },
      {
        "id": 2644,
        "logprob": -0.020141602,
        "special": false,
        "text": " range"
      },
      {
        "id": 529,
        "logprob": -0.14550781,
        "special": false,
        "text": " of"
      },
      {
        "id": 14287,
        "logprob": -0.03564453,
        "special": false,
        "text": " estimates"
      },
      {
        "id": 236761,
        "logprob": -0.010620117,
        "special": false,
        "text": "."
      },
      {
        "id": 108,
        "logprob": -0.060302734,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 8291,
        "logprob": -0.7421875,
        "special": false,
        "text": "Here"
      },
      {
        "id": 236789,
        "logprob": -0.24023438,
        "special": false,
        "text": "'"
      },
      {
        "id": 236751,
        "logprob": -1.0728836e-06,
        "special": false,
        "text": "s"
      },
      {
        "id": 496,
        "logprob": -0.16992188,
        "special": false,
        "text": " a"
      },
      {
        "id": 25890,
        "logprob": -0.06933594,
        "special": false,
        "text": " breakdown"
      },
      {
        "id": 529,
        "logprob": -0.002243042,
        "special": false,
        "text": " of"
      },
      {
        "id": 506,
        "logprob": -0.18554688,
        "special": false,
        "text": " the"
      },
      {
        "id": 5872,
        "logprob": -0.9921875,
        "special": false,
        "text": " factors"
      },
      {
        "id": 20894,
        "logprob": -0.25976562,
        "special": false,
        "text": " contributing"
      },
      {
        "id": 531,
        "logprob": -8.440018e-05,
        "special": false,
        "text": " to"
      },
      {
        "id": 506,
        "logprob": -0.009765625,
        "special": false,
        "text": " the"
      },
      {
        "id": 5777,
        "logprob": -0.67578125,
        "special": false,
        "text": " wide"
      },
      {
        "id": 2644,
        "logprob": -0.0023956299,
        "special": false,
        "text": " range"
      },
      {
        "id": 529,
        "logprob": -0.014831543,
        "special": false,
        "text": " of"
      },
      {
        "id": 14287,
        "logprob": -0.012329102,
        "special": false,
        "text": " estimates"
      },
      {
        "id": 573,
        "logprob": -0.3125,
        "special": false,
        "text": " for"
      },
      {
        "id": 506,
        "logprob": -0.21484375,
        "special": false,
        "text": " the"
      },
      {
        "id": 236743,
        "logprob": -0.43359375,
        "special": false,
        "text": " "
      },
      {
        "id": 236770,
        "logprob": -3.5762787e-07,
        "special": false,
        "text": "1"
      },
      {
        "id": 236819,
        "logprob": 0.0,
        "special": false,
        "text": "9"
      },
      {
        "id": 236770,
        "logprob": 0.0,
        "special": false,
        "text": "1"
      },
      {
        "id": 236828,
        "logprob": 0.0,
        "special": false,
        "text": "8"
      },
      {
        "id": 7745,
        "logprob": -0.703125,
        "special": false,
        "text": " flu"
      },
      {
        "id": 10248,
        "logprob": -0.013427734,
        "special": false,
        "text": " pandemic"
      },
      {
        "id": 4355,
        "logprob": -0.6953125,
        "special": false,
        "text": " death"
      },
      {
        "id": 25363,
        "logprob": -6.771088e-05,
        "special": false,
        "text": " toll"
      },
      {
        "id": 528,
        "logprob": -0.076171875,
        "special": false,
        "text": " in"
      },
      {
        "id": 506,
        "logprob": -7.2717667e-06,
        "special": false,
        "text": " the"
      },
      {
        "id": 3640,
        "logprob": -0.0052490234,
        "special": false,
        "text": " United"
      },
      {
        "id": 4184,
        "logprob": 0.0,
        "special": false,
        "text": " States"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " people died in the United States.\n\nThe generally accepted estimate is that 675,000 people died in the United States. However, some historians believe the actual number could be as high as 10 million.\n\nI am looking for more information on this discrepancy and the factors that contributed to the wide range of estimates.\n\nHere's a breakdown of the factors contributing to the wide range of estimates for the 1918 flu pandemic death toll in the United States"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Okay, let's analyze the image.\n\nThe image is a solid, bright white color. There is nothing else visible within it. \n\nIt's essentially a blank white square or rectangle.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1747062956,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.3.6-dev0-native",
  "usage": {
    "completion_tokens": 42,
    "prompt_tokens": 277,
    "total_tokens": 319
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Okay, let's analyze the image. \n\nThe image is a very plain, solid white square. That's it! \n\nIt's essentially a blank canvas. \n\nDo you want me to describe it in more detail, or are you interested in something else regarding this image?",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1747062955,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.3.6-dev0-native",
  "usage": {
    "completion_tokens": 62,
    "prompt_tokens": 277,
    "total_tokens": 339
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Okay, let's analyze the image. \n\nThe transparent image reveals a stylized depiction of **a human head**. It's a minimalist, geometric representation, showing the basic shapes of the skull, eye sockets, and head outline. \n\nDo you want me to describe any specific element of the image in more detail?",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1747062952,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.3.6-dev0-native",
  "usage": {
    "completion_tokens": 67,
    "prompt_tokens": 277,
    "total_tokens": 344
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a quite a humorous and unusual scene – a cow enjoying a beach day!",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1747216083,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.3.6-dev0-native",
  "usage": {
    "completion_tokens": 72,
    "prompt_tokens": 275,
    "total_tokens": 347
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "That's a fantastic question! However, the image doesn't show a dog. It shows a **Brown Swiss cow** standing on a beach. \n\nBrown Swiss cows are known for their beautiful reddish-brown coats and distinctive white markings. \n\nIf you'd like, you can send me another image, and I'll do my best to identify the animal in it!",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1747216080,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.3.6-dev0-native",
  "usage": {
    "completion_tokens": 80,
    "prompt_tokens": 279,
    "total_tokens": 359
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 604,
        "logprob": -2.4296875,
        "special": false,
        "text": " for"
      },
      {
        "id": 573,
        "logprob": -2.4453125,
        "special": false,
        "text": " the"
      },
      {
        "id": 2412,
        "logprob": -2.8632812,
        "special": false,
        "text": " following"
      },
      {
        "id": 235292,
        "logprob": -2.1328125,
        "special": false,
        "text": ":"
      },
      {
        "id": 109,
        "logprob": -0.76660156,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 235287,
        "logprob": -1.3837891,
        "special": false,
        "text": "*"
      },
      {
        "id": 235248,
        "logprob": -1.9746094,
        "special": false,
        "text": " "
      },
      {
        "id": 199,
        "logprob": -1.4189453,
        "special": false,
        "text": "<strong>"
      },
      {
        "id": 1232,
        "logprob": -4.34375,
        "special": false,
        "text": "Name"
      },
      {
        "id": 208,
        "logprob": -0.8852539,
        "special": false,
        "text": "</strong>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " for the following:\n\n* <strong>Name</strong>"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 604,
        "logprob": -0.28271484,
        "special": false,
        "text": " for"
      },
      {
        "id": 573,
        "logprob": -0.19030762,
        "special": false,
        "text": " the"
      },
      {
        "id": 16819,
        "logprob": -1.4863281,
        "special": false,
        "text": " detection"
      },
      {
        "id": 576,
        "logprob": -0.7089844,
        "special": false,
        "text": " of"
      },
      {
        "id": 573,
        "logprob": -2.0410156,
        "special": false,
        "text": " the"
      },
      {
        "id": 8566,
        "logprob": 0.0,
        "special": false,
        "text": " presence"
      },
      {
        "id": 689,
        "logprob": -0.16491699,
        "special": false,
        "text": " or"
      },
      {
        "id": 14862,
        "logprob": 0.0,
        "special": false,
        "text": " absence"
      },
      {
        "id": 576,
        "logprob": -0.9970703,
        "special": false,
        "text": " of"
      },
      {
        "id": 671,
        "logprob": -0.5292969,
        "special": false,
        "text": " an"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request for the detection of the presence or absence of an"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 604,
          "logprob": -2.4277344,
          "special": false,
          "text": " for"
        },
        {
          "id": 573,
          "logprob": -2.4394531,
          "special": false,
          "text": " the"
        },
        {
          "id": 2412,
          "logprob": -2.8613281,
          "special": false,
          "text": " following"
        },
        {
          "id": 235292,
          "logprob": -2.1523438,
          "special": false,
          "text": ":"
        },
        {
          "id": 109,
          "logprob": -0.76220703,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 235287,
          "logprob": -1.3642578,
          "special": false,
          "text": "*"
        },
        {
          "id": 235248,
          "logprob": -2.0175781,
          "special": false,
          "text": " "
        },
        {
          "id": 199,
          "logprob": -1.4238281,
          "special": false,
          "text": "<strong>"
        },
        {
          "id": 1232,
          "logprob": -4.328125,
          "special": false,
          "text": "Name"
        },
        {
          "id": 208,
          "logprob": -0.8881836,
          "special": false,
          "text": "</strong>"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for the following:\n\n* <strong>Name</strong>"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 604,
          "logprob": -2.4238281,
          "special": false,
          "text": " for"
        },
        {
          "id": 573,
          "logprob": -2.4453125,
          "special": false,
          "text": " the"
        },
        {
          "id": 2412,
          "logprob": -2.859375,
          "special": false,
          "text": " following"
        },
        {
          "id": 235292,
          "logprob": -2.1445312,
          "special": false,
          "text": ":"
        },
        {
          "id": 109,
          "logprob": -0.7631836,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 235287,
          "logprob": -1.3642578,
          "special": false,
          "text": "*"
        },
        {
          "id": 235248,
          "logprob": -1.9960938,
          "special": false,
          "text": " "
        },
        {
          "id": 199,
          "logprob": -1.4179688,
          "special": false,
          "text": "<strong>"
        },
        {
          "id": 1232,
          "logprob": -4.3359375,
          "special": false,
          "text": "Name"
        },
        {
          "id": 208,
          "logprob": -0.8847656,
          "special": false,
          "text": "</strong>"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for the following:\n\n* <strong>Name</strong>"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 604,
          "logprob": -2.4257812,
          "special": false,
          "text": " for"
        },
        {
          "id": 573,
          "logprob": -2.4453125,
          "special": false,
          "text": " the"
        },
        {
          "id": 2412,
          "logprob": -2.8789062,
          "special": false,
          "text": " following"
        },
        {
          "id": 235292,
          "logprob": -2.1367188,
          "special": false,
          "text": ":"
        },
        {
          "id": 109,
          "logprob": -0.76171875,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 235287,
          "logprob": -1.3515625,
          "special": false,
          "text": "*"
        },
        {
          "id": 235248,
          "logprob": -1.9873047,
          "special": false,
          "text": " "
        },
        {
          "id": 199,
          "logprob": -1.4169922,
          "special": false,
          "text": "<strong>"
        },
        {
          "id": 1232,
          "logprob": -4.3320312,
          "special": false,
          "text": "Name"
        },
        {
          "id": 208,
          "logprob": -0.8930664,
          "special": false,
          "text": "</strong>"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for the following:\n\n* <strong>Name</strong>"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 604,
          "logprob": -2.4179688,
          "special": false,
          "text": " for"
        },
        {
          "id": 573,
          "logprob": -2.4492188,
          "special": false,
          "text": " the"
        },
        {
          "id": 2412,
          "logprob": -2.8574219,
          "special": false,
          "text": " following"
        },
        {
          "id": 235292,
          "logprob": -2.1445312,
          "special": false,
          "text": ":"
        },
        {
          "id": 109,
          "logprob": -0.7519531,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 235287,
          "logprob": -1.3623047,
          "special": false,
          "text": "*"
        },
        {
          "id": 235248,
          "logprob": -1.9707031,
          "special": false,
          "text": " "
        },
        {
          "id": 199,
          "logprob": -1.4267578,
          "special": false,
          "text": "<strong>"
        },
        {
          "id": 1232,
          "logprob": -4.3359375,
          "special": false,
          "text": "Name"
        },
        {
          "id": 208,
          "logprob": -0.88427734,
          "special": false,
          "text": "</strong>"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for the following:\n\n* <strong>Name</strong>"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 198,
        "logprob": -0.68603516,
        "special": false,
        "text": "\n"
      },
      {
        "id": 198,
        "logprob": -0.005393982,
        "special": false,
        "text": "\n"
      },
      {
        "id": 29744,
        "logprob": -0.31079102,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 4673,
        "logprob": -0.08300781,
        "special": false,
        "text": " learning"
      },
      {
        "id": 318,
        "logprob": -0.58984375,
        "special": false,
        "text": " is"
      },
      {
        "id": 257,
        "logprob": -0.953125,
        "special": false,
        "text": " a"
      },
      {
        "id": 649,
        "logprob": -2.0957031,
        "special": false,
        "text": " new"
      },
      {
        "id": 2214,
        "logprob": -1.8095703,
        "special": false,
        "text": " field"
      },
      {
        "id": 286,
        "logprob": -1.0673828,
        "special": false,
        "text": " of"
      },
      {
        "id": 2267,
        "logprob": -0.9375,
        "special": false,
        "text": " research"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a new field of research"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -0.68603516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 198,
          "logprob": -0.005672455,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29744,
          "logprob": -0.3251953,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4673,
          "logprob": -0.08294678,
          "special": false,
          "text": " learning"
        },
        {
          "id": 318,
          "logprob": -0.5854492,
          "special": false,
          "text": " is"
        },
        {
          "id": 257,
          "logprob": -0.9423828,
          "special": false,
          "text": " a"
        },
        {
          "id": 649,
          "logprob": -2.0800781,
          "special": false,
          "text": " new"
        },
        {
          "id": 2214,
          "logprob": -1.8369141,
          "special": false,
          "text": " field"
        },
        {
          "id": 286,
          "logprob": -1.0683594,
          "special": false,
          "text": " of"
        },
        {
          "id": 2267,
          "logprob": -0.9711914,
          "special": false,
          "text": " research"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -0.7089844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 198,
          "logprob": -0.0054779053,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29744,
          "logprob": -0.3190918,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4673,
          "logprob": -0.08319092,
          "special": false,
          "text": " learning"
        },
        {
          "id": 318,
          "logprob": -0.5839844,
          "special": false,
          "text": " is"
        },
        {
          "id": 257,
          "logprob": -0.9506836,
          "special": false,
          "text": " a"
        },
        {
          "id": 649,
          "logprob": -2.0878906,
          "special": false,
          "text": " new"
        },
        {
          "id": 2214,
          "logprob": -1.8496094,
          "special": false,
          "text": " field"
        },
        {
          "id": 286,
          "logprob": -1.0673828,
          "special": false,
          "text": " of"
        },
        {
          "id": 2267,
          "logprob": -0.9370117,
          "special": false,
          "text": " research"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -0.7089844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 198,
          "logprob": -0.0054779053,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29744,
          "logprob": -0.3190918,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4673,
          "logprob": -0.08319092,
          "special": false,
          "text": " learning"
        },
        {
          "id": 318,
          "logprob": -0.5839844,
          "special": false,
          "text": " is"
        },
        {
          "id": 257,
          "logprob": -0.9506836,
          "special": false,
          "text": " a"
        },
        {
          "id": 649,
          "logprob": -2.0878906,
          "special": false,
          "text": " new"
        },
        {
          "id": 2214,
          "logprob": -1.8496094,
          "special": false,
          "text": " field"
        },
        {
          "id": 286,
          "logprob": -1.0673828,
          "special": false,
          "text": " of"
        },
        {
          "id": 2267,
          "logprob": -0.9370117,
          "special": false,
          "text": " research"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -0.7089844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 198,
          "logprob": -0.0054779053,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29744,
          "logprob": -0.3190918,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4673,
          "logprob": -0.08319092,
          "special": false,
          "text": " learning"
        },
        {
          "id": 318,
          "logprob": -0.5839844,
          "special": false,
          "text": " is"
        },
        {
          "id": 257,
          "logprob": -0.9506836,
          "special": false,
          "text": " a"
        },
        {
          "id": 649,
          "logprob": -2.0878906,
          "special": false,
          "text": " new"
        },
        {
          "id": 2214,
          "logprob": -1.8496094,
          "special": false,
          "text": " field"
        },
        {
          "id": 286,
          "logprob": -1.0673828,
          "special": false,
          "text": " of"
        },
        {
          "id": 2267,
          "logprob": -0.9370117,
          "special": false,
          "text": " research"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -2.0566406,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -1.5253906,
        "special": false,
        "text": "\n"
      },
      {
        "id": 29902,
        "logprob": -2.7578125,
        "special": false,
        "text": "I"
      },
      {
        "id": 4966,
        "logprob": -1.9033203,
        "special": false,
        "text": " hope"
      },
      {
        "id": 445,
        "logprob": -0.5019531,
        "special": false,
        "text": " this"
      },
      {
        "id": 6911,
        "logprob": -0.21264648,
        "special": false,
        "text": " helps"
      },
      {
        "id": 29991,
        "logprob": -0.5991211,
        "special": false,
        "text": "!"
      },
      {
        "id": 2803,
        "logprob": -0.37475586,
        "special": false,
        "text": " Let"
      },
      {
        "id": 592,
        "logprob": -0.018463135,
        "special": false,
        "text": " me"
      },
      {
        "id": 1073,
        "logprob": -0.0008597374,
        "special": false,
        "text": " know"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nI hope this helps! Let me know"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_json.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 30,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 6377,
        "logprob": -0.14916992,
        "special": false,
        "text": "{\""
      },
      {
        "id": 29888,
        "logprob": -0.13598633,
        "special": false,
        "text": "f"
      },
      {
        "id": 12935,
        "logprob": -0.017669678,
        "special": false,
        "text": "irs"
      },
      {
        "id": 29873,
        "logprob": -0.00085639954,
        "special": false,
        "text": "t"
      },
      {
        "id": 1170,
        "logprob": -0.0054016113,
        "special": false,
        "text": "Name"
      },
      {
        "id": 4710,
        "logprob": -0.13549805,
        "special": false,
        "text": "\":\""
      },
      {
        "id": 19504,
        "logprob": -0.8852539,
        "special": false,
        "text": "David"
      },
      {
        "id": 3284,
        "logprob": -0.16394043,
        "special": false,
        "text": "\",\""
      },
      {
        "id": 29882,
        "logprob": -0.08862305,
        "special": false,
        "text": "h"
      },
      {
        "id": 711,
        "logprob": -0.66259766,
        "special": false,
        "text": "ob"
      },
      {
        "id": 1609,
        "logprob": -5.51939e-05,
        "special": false,
        "text": "by"
      },
      {
        "id": 4710,
        "logprob": -0.23120117,
        "special": false,
        "text": "\":\""
      },
      {
        "id": 29911,
        "logprob": -2.3730469,
        "special": false,
        "text": "T"
      },
      {
        "id": 11003,
        "logprob": -0.032104492,
        "special": false,
        "text": "rees"
      },
      {
        "id": 3284,
        "logprob": -0.22021484,
        "special": false,
        "text": "\",\""
      },
      {
        "id": 4230,
        "logprob": -0.06726074,
        "special": false,
        "text": "last"
      },
      {
        "id": 1170,
        "logprob": -0.003501892,
        "special": false,
        "text": "Name"
      },
      {
        "id": 4710,
        "logprob": -0.0045661926,
        "special": false,
        "text": "\":\""
      },
      {
        "id": 29950,
        "logprob": -0.12512207,
        "special": false,
        "text": "H"
      },
      {
        "id": 14339,
        "logprob": -0.009552002,
        "special": false,
        "text": "olt"
      },
      {
        "id": 29920,
        "logprob": -0.00042438507,
        "special": false,
        "text": "z"
      },
      {
        "id": 3284,
        "logprob": -0.11651611,
        "special": false,
        "text": "\",\""
      },
      {
        "id": 29876,
        "logprob": -0.29736328,
        "special": false,
        "text": "n"
      },
      {
        "id": 398,
        "logprob": -0.003030777,
        "special": false,
        "text": "um"
      },
      {
        "id": 29907,
        "logprob": -0.3774414,
        "special": false,
        "text": "C"
      },
      {
        "id": 1446,
        "logprob": -0.0003130436,
        "special": false,
        "text": "ats"
      },
      {
        "id": 1115,
        "logprob": -0.0021514893,
        "special": false,
        "text": "\":"
      },
      {
        "id": 29906,
        "logprob": -0.071899414,
        "special": false,
        "text": "2"
      },
      {
        "id": 29913,
        "logprob": -0.018997192,
        "special": false,
        "text": "}"
      },
      {
        "id": 2,
        "logprob": 0.0,
        "special": true,
        "text": "</s>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "{\"firstName\":\"David\",\"hobby\":\"Trees\",\"lastName\":\"Holtz\",\"numCats\":2}"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 29896,
          "logprob": -0.7709961,
          "special": false,
          "text": "1"
        },
        {
          "id": 29906,
          "logprob": -0.33740234,
          "special": false,
          "text": "2"
        },
        {
          "id": 29941,
          "logprob": -0.00995636,
          "special": false,
          "text": "3"
        },
        {
          "id": 29946,
          "logprob": -0.64208984,
          "special": false,
          "text": "4"
        },
        {
          "id": 29945,
          "logprob": -0.4970703,
          "special": false,
          "text": "5"
        },
        {
          "id": 29953,
          "logprob": -0.46533203,
          "special": false,
          "text": "6"
        },
        {
          "id": 29992,
          "logprob": -0.5336914,
          "special": false,
          "text": "@"
        },
        {
          "id": 21980,
          "logprob": -0.5361328,
          "special": false,
          "text": "gmail"
        },
        {
          "id": 29889,
          "logprob": -0.00088739395,
          "special": false,
          "text": "."
        },
        {
          "id": 510,
          "logprob": -0.0022735596,
          "special": false,
          "text": "com"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "123456@gmail.com"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 29896,
          "logprob": -0.7685547,
          "special": false,
          "text": "1"
        },
        {
          "id": 29906,
          "logprob": -0.33666992,
          "special": false,
          "text": "2"
        },
        {
          "id": 29941,
          "logprob": -0.01008606,
          "special": false,
          "text": "3"
        },
        {
          "id": 29946,
          "logprob": -0.64160156,
          "special": false,
          "text": "4"
        },
        {
          "id": 29945,
          "logprob": -0.5,
          "special": false,
          "text": "5"
        },
        {
          "id": 29953,
          "logprob": -0.46557617,
          "special": false,
          "text": "6"
        },
        {
          "id": 29992,
          "logprob": -0.5341797,
          "special": false,
          "text": "@"
        },
        {
          "id": 21980,
          "logprob": -0.5361328,
          "special": false,
          "text": "gmail"
        },
        {
          "id": 29889,
          "logprob": -0.00088739395,
          "special": false,
          "text": "."
        },
        {
          "id": 510,
          "logprob": -0.0022907257,
          "special": false,
          "text": "com"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "123456@gmail.com"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 29896,
          "logprob": -0.7709961,
          "special": false,
          "text": "1"
        },
        {
          "id": 29906,
          "logprob": -0.33740234,
          "special": false,
          "text": "2"
        },
        {
          "id": 29941,
          "logprob": -0.00995636,
          "special": false,
          "text": "3"
        },
        {
          "id": 29946,
          "logprob": -0.64208984,
          "special": false,
          "text": "4"
        },
        {
          "id": 29945,
          "logprob": -0.4970703,
          "special": false,
          "text": "5"
        },
        {
          "id": 29953,
          "logprob": -0.46533203,
          "special": false,
          "text": "6"
        },
        {
          "id": 29992,
          "logprob": -0.5336914,
          "special": false,
          "text": "@"
        },
        {
          "id": 21980,
          "logprob": -0.5361328,
          "special": false,
          "text": "gmail"
        },
        {
          "id": 29889,
          "logprob": -0.00088739395,
          "special": false,
          "text": "."
        },
        {
          "id": 510,
          "logprob": -0.0022735596,
          "special": false,
          "text": "com"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "123456@gmail.com"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 29896,
          "logprob": -0.7709961,
          "special": false,
          "text": "1"
        },
        {
          "id": 29906,
          "logprob": -0.33740234,
          "special": false,
          "text": "2"
        },
        {
          "id": 29941,
          "logprob": -0.00995636,
          "special": false,
          "text": "3"
        },
        {
          "id": 29946,
          "logprob": -0.64208984,
          "special": false,
          "text": "4"
        },
        {
          "id": 29945,
          "logprob": -0.4970703,
          "special": false,
          "text": "5"
        },
        {
          "id": 29953,
          "logprob": -0.46533203,
          "special": false,
          "text": "6"
        },
        {
          "id": 29992,
          "logprob": -0.5336914,
          "special": false,
          "text": "@"
        },
        {
          "id": 21980,
          "logprob": -0.5361328,
          "special": false,
          "text": "gmail"
        },
        {
          "id": 29889,
          "logprob": -0.00088739395,
          "special": false,
          "text": "."
        },
        {
          "id": 510,
          "logprob": -0.0022735596,
          "special": false,
          "text": "com"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "123456@gmail.com"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_regex.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 29946,
        "logprob": -1.4765625,
        "special": false,
        "text": "4"
      },
      {
        "id": 29906,
        "logprob": -0.9199219,
        "special": false,
        "text": "2"
      },
      {
        "id": 29889,
        "logprob": 0.0,
        "special": false,
        "text": "."
      },
      {
        "id": 29896,
        "logprob": -1.1367188,
        "special": false,
        "text": "1"
      },
      {
        "id": 29889,
        "logprob": -1.4648438,
        "special": false,
        "text": "."
      },
      {
        "id": 29896,
        "logprob": -0.40722656,
        "special": false,
        "text": "1"
      },
      {
        "id": 29889,
        "logprob": -0.17419434,
        "special": false,
        "text": "."
      },
      {
        "id": 29896,
        "logprob": -0.20251465,
        "special": false,
        "text": "1"
      },
      {
        "id": 29900,
        "logprob": -1.5527344,
        "special": false,
        "text": "0"
      },
      {
        "id": 29896,
        "logprob": -1.3710938,
        "special": false,
        "text": "1"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "42.1.1.101"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_single_load_instance.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 29896,
        "logprob": -0.7685547,
        "special": false,
        "text": "1"
      },
      {
        "id": 29906,
        "logprob": -0.33666992,
        "special": false,
        "text": "2"
      },
      {
        "id": 29941,
        "logprob": -0.009979248,
        "special": false,
        "text": "3"
      },
      {
        "id": 29946,
        "logprob": -0.64208984,
        "special": false,
        "text": "4"
      },
      {
        "id": 29945,
        "logprob": -0.4970703,
        "special": false,
        "text": "5"
      },
      {
        "id": 29953,
        "logprob": -0.46533203,
        "special": false,
        "text": "6"
      },
      {
        "id": 29992,
        "logprob": -0.5336914,
        "special": false,
        "text": "@"
      },
      {
        "id": 21980,
        "logprob": -0.53759766,
        "special": false,
        "text": "gmail"
      },
      {
        "id": 29889,
        "logprob": -0.0008878708,
        "special": false,
        "text": "."
      },
      {
        "id": 510,
        "logprob": -0.002275467,
        "special": false,
        "text": "com"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "123456@gmail.com"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "stop_sequence",
    "generated_tokens": 5,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 5229,
        "logprob": -2.5839844,
        "special": false,
        "text": " failed"
      },
      {
        "id": 29901,
        "logprob": -0.44970703,
        "special": false,
        "text": ":"
      },
      {
        "id": 4829,
        "logprob": -1.8339844,
        "special": false,
        "text": " Error"
      },
      {
        "id": 297,
        "logprob": -1.0556641,
        "special": false,
        "text": " in"
      },
      {
        "id": 1243,
        "logprob": 0.0,
        "special": false,
        "text": " test"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request failed: Error in test"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 363,
          "logprob": -1.5351562,
          "special": false,
          "text": " for"
        },
        {
          "id": 847,
          "logprob": -2.5566406,
          "special": false,
          "text": " /"
        },
        {
          "id": 2754,
          "logprob": -2.2519531,
          "special": false,
          "text": "api"
        },
        {
          "id": 29914,
          "logprob": -0.03414917,
          "special": false,
          "text": "/"
        },
        {
          "id": 29894,
          "logprob": -0.96240234,
          "special": false,
          "text": "v"
        },
        {
          "id": 29896,
          "logprob": -0.3647461,
          "special": false,
          "text": "1"
        },
        {
          "id": 29914,
          "logprob": -0.012901306,
          "special": false,
          "text": "/"
        },
        {
          "id": 16418,
          "logprob": -3.1542969,
          "special": false,
          "text": "projects"
        },
        {
          "id": 29914,
          "logprob": -0.4362793,
          "special": false,
          "text": "/"
        },
        {
          "id": 29896,
          "logprob": -1.9394531,
          "special": false,
          "text": "1"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for /api/v1/projects/1"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 363,
          "logprob": -1.5332031,
          "special": false,
          "text": " for"
        },
        {
          "id": 847,
          "logprob": -2.5625,
          "special": false,
          "text": " /"
        },
        {
          "id": 2754,
          "logprob": -2.2617188,
          "special": false,
          "text": "api"
        },
        {
          "id": 29914,
          "logprob": -0.033996582,
          "special": false,
          "text": "/"
        },
        {
          "id": 29894,
          "logprob": -0.9609375,
          "special": false,
          "text": "v"
        },
        {
          "id": 29896,
          "logprob": -0.36572266,
          "special": false,
          "text": "1"
        },
        {
          "id": 29914,
          "logprob": -0.0129776,
          "special": false,
          "text": "/"
        },
        {
          "id": 16418,
          "logprob": -3.15625,
          "special": false,
          "text": "projects"
        },
        {
          "id": 29914,
          "logprob": -0.4362793,
          "special": false,
          "text": "/"
        },
        {
          "id": 29896,
          "logprob": -1.9394531,
          "special": false,
          "text": "1"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for /api/v1/projects/1"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 363,
          "logprob": -1.5332031,
          "special": false,
          "text": " for"
        },
        {
          "id": 847,
          "logprob": -2.5625,
          "special": false,
          "text": " /"
        },
        {
          "id": 2754,
          "logprob": -2.2617188,
          "special": false,
          "text": "api"
        },
        {
          "id": 29914,
          "logprob": -0.033996582,
          "special": false,
          "text": "/"
        },
        {
          "id": 29894,
          "logprob": -0.9609375,
          "special": false,
          "text": "v"
        },
        {
          "id": 29896,
          "logprob": -0.36572266,
          "special": false,
          "text": "1"
        },
        {
          "id": 29914,
          "logprob": -0.0129776,
          "special": false,
          "text": "/"
        },
        {
          "id": 16418,
          "logprob": -3.15625,
          "special": false,
          "text": "projects"
        },
        {
          "id": 29914,
          "logprob": -0.4362793,
          "special": false,
          "text": "/"
        },
        {
          "id": 29896,
          "logprob": -1.9394531,
          "special": false,
          "text": "1"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for /api/v1/projects/1"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 363,
          "logprob": -1.5332031,
          "special": false,
          "text": " for"
        },
        {
          "id": 847,
          "logprob": -2.5625,
          "special": false,
          "text": " /"
        },
        {
          "id": 2754,
          "logprob": -2.2617188,
          "special": false,
          "text": "api"
        },
        {
          "id": 29914,
          "logprob": -0.033996582,
          "special": false,
          "text": "/"
        },
        {
          "id": 29894,
          "logprob": -0.9609375,
          "special": false,
          "text": "v"
        },
        {
          "id": 29896,
          "logprob": -0.36572266,
          "special": false,
          "text": "1"
        },
        {
          "id": 29914,
          "logprob": -0.0129776,
          "special": false,
          "text": "/"
        },
        {
          "id": 16418,
          "logprob": -3.15625,
          "special": false,
          "text": "projects"
        },
        {
          "id": 29914,
          "logprob": -0.4362793,
          "special": false,
          "text": "/"
        },
        {
          "id": 29896,
          "logprob": -1.9394531,
          "special": false,
          "text": "1"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for /api/v1/projects/1"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_simple.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 363,
        "logprob": -1.5351562,
        "special": false,
        "text": " for"
      },
      {
        "id": 847,
        "logprob": -2.5722656,
        "special": false,
        "text": " /"
      },
      {
        "id": 2754,
        "logprob": -2.2714844,
        "special": false,
        "text": "api"
      },
      {
        "id": 29914,
        "logprob": -0.03414917,
        "special": false,
        "text": "/"
      },
      {
        "id": 29894,
        "logprob": -0.95996094,
        "special": false,
        "text": "v"
      },
      {
        "id": 29896,
        "logprob": -0.3635254,
        "special": false,
        "text": "1"
      },
      {
        "id": 29914,
        "logprob": -0.013031006,
        "special": false,
        "text": "/"
      },
      {
        "id": 16418,
        "logprob": -3.1523438,
        "special": false,
        "text": "projects"
      },
      {
        "id": 29914,
        "logprob": -0.43701172,
        "special": false,
        "text": "/"
      },
      {
        "id": 29896,
        "logprob": -1.9394531,
        "special": false,
        "text": "1"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " for /api/v1/projects/1"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 25,
        "logprob": -2.9316406,
        "special": false,
        "text": ":"
      },
      {
        "id": 330,
        "logprob": -3.5136719,
        "special": false,
        "text": " \""
      },
      {
        "id": 489,
        "logprob": -0.7783203,
        "special": false,
        "text": " +"
      },
      {
        "id": 1715,
        "logprob": -1.2314453,
        "special": false,
        "text": " request"
      },
      {
        "id": 489,
        "logprob": -2.0019531,
        "special": false,
        "text": " +"
      },
      {
        "id": 2990,
        "logprob": -1.5009766,
        "special": false,
        "text": " \"\\"
      },
      {
        "id": 77,
        "logprob": -0.057434082,
        "special": false,
        "text": "n"
      },
      {
        "id": 702,
        "logprob": -1.4912109,
        "special": false,
        "text": "\"\n"
      },
      {
        "id": 262,
        "logprob": -1.2636719,
        "special": false,
        "text": "   "
      },
      {
        "id": 557,
        "logprob": -2.4042969,
        "special": false,
        "text": " }\n\n"
      }
    ],
    "top_tokens": null
  },
  "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 13,
        "logprob": -1.9980469,
        "special": false,
        "text": "."
      },
      {
        "id": 578,
        "logprob": -0.15795898,
        "special": false,
        "text": " The"
      },
      {
        "id": 3622,
        "logprob": -1.0458984,
        "special": false,
        "text": " server"
      },
      {
        "id": 31680,
        "logprob": -1.3623047,
        "special": false,
        "text": " responds"
      },
      {
        "id": 449,
        "logprob": 0.0,
        "special": false,
        "text": " with"
      },
      {
        "id": 264,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 330,
        "logprob": -0.5678711,
        "special": false,
        "text": " \""
      },
      {
        "id": 1049,
        "logprob": -0.12322998,
        "special": false,
        "text": "200"
      },
      {
        "id": 10619,
        "logprob": 0.0,
        "special": false,
        "text": " OK"
      },
      {
        "id": 1,
        "logprob": 0.0,
        "special": false,
        "text": "\""
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request. The server responds with a \"200 OK\""
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25,
          "logprob": -2.9785156,
          "special": false,
          "text": ":"
        },
        {
          "id": 330,
          "logprob": -3.4941406,
          "special": false,
          "text": " \""
        },
        {
          "id": 489,
          "logprob": -0.79345703,
          "special": false,
          "text": " +"
        },
        {
          "id": 1715,
          "logprob": -1.2324219,
          "special": false,
          "text": " request"
        },
        {
          "id": 489,
          "logprob": -1.9794922,
          "special": false,
          "text": " +"
        },
        {
          "id": 2990,
          "logprob": -1.4892578,
          "special": false,
          "text": " \"\\"
        },
        {
          "id": 77,
          "logprob": -0.058258057,
          "special": false,
          "text": "n"
        },
        {
          "id": 702,
          "logprob": -1.4892578,
          "special": false,
          "text": "\"\n"
        },
        {
          "id": 262,
          "logprob": -1.2783203,
          "special": false,
          "text": "   "
        },
        {
          "id": 557,
          "logprob": -2.3945312,
          "special": false,
          "text": " }\n\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25,
          "logprob": -2.9433594,
          "special": false,
          "text": ":"
        },
        {
          "id": 330,
          "logprob": -3.4726562,
          "special": false,
          "text": " \""
        },
        {
          "id": 489,
          "logprob": -0.8022461,
          "special": false,
          "text": " +"
        },
        {
          "id": 1715,
          "logprob": -1.2509766,
          "special": false,
          "text": " request"
        },
        {
          "id": 489,
          "logprob": -1.984375,
          "special": false,
          "text": " +"
        },
        {
          "id": 2990,
          "logprob": -1.4677734,
          "special": false,
          "text": " \"\\"
        },
        {
          "id": 77,
          "logprob": -0.059173584,
          "special": false,
          "text": "n"
        },
        {
          "id": 702,
          "logprob": -1.4990234,
          "special": false,
          "text": "\"\n"
        },
        {
          "id": 262,
          "logprob": -1.2822266,
          "special": false,
          "text": "   "
        },
        {
          "id": 557,
          "logprob": -2.3867188,
          "special": false,
          "text": " }\n\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25,
          "logprob": -2.9511719,
          "special": false,
          "text": ":"
        },
        {
          "id": 330,
          "logprob": -3.46875,
          "special": false,
          "text": " \""
        },
        {
          "id": 489,
          "logprob": -0.77490234,
          "special": false,
          "text": " +"
        },
        {
          "id": 1715,
          "logprob": -1.2558594,
          "special": false,
          "text": " request"
        },
        {
          "id": 489,
          "logprob": -1.984375,
          "special": false,
          "text": " +"
        },
        {
          "id": 2990,
          "logprob": -1.4990234,
          "special": false,
          "text": " \"\\"
        },
        {
          "id": 77,
          "logprob": -0.059143066,
          "special": false,
          "text": "n"
        },
        {
          "id": 702,
          "logprob": -1.4941406,
          "special": false,
          "text": "\"\n"
        },
        {
          "id": 262,
          "logprob": -1.2578125,
          "special": false,
          "text": "   "
        },
        {
          "id": 557,
          "logprob": -2.3964844,
          "special": false,
          "text": " }\n\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25,
          "logprob": -2.9101562,
          "special": false,
          "text": ":"
        },
        {
          "id": 330,
          "logprob": -3.5039062,
          "special": false,
          "text": " \""
        },
        {
          "id": 489,
          "logprob": -0.8076172,
          "special": false,
          "text": " +"
        },
        {
          "id": 1715,
          "logprob": -1.2236328,
          "special": false,
          "text": " request"
        },
        {
          "id": 489,
          "logprob": -1.9853516,
          "special": false,
          "text": " +"
        },
        {
          "id": 2990,
          "logprob": -1.4892578,
          "special": false,
          "text": " \"\\"
        },
        {
          "id": 77,
          "logprob": -0.056671143,
          "special": false,
          "text": "n"
        },
        {
          "id": 702,
          "logprob": -1.5107422,
          "special": false,
          "text": "\"\n"
        },
        {
          "id": 262,
          "logprob": -1.2597656,
          "special": false,
          "text": "   "
        },
        {
          "id": 557,
          "logprob": -2.4042969,
          "special": false,
          "text": " }\n\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 369,
        "logprob": -2.1816406,
        "special": false,
        "text": " for"
      },
      {
        "id": 279,
        "logprob": -2.6992188,
        "special": false,
        "text": " the"
      },
      {
        "id": 220,
        "logprob": -3.6308594,
        "special": false,
        "text": " "
      },
      {
        "id": 679,
        "logprob": -1.7900391,
        "special": false,
        "text": "201"
      },
      {
        "id": 24,
        "logprob": -1.3554688,
        "special": false,
        "text": "9"
      },
      {
        "id": 12,
        "logprob": -2.0039062,
        "special": false,
        "text": "-"
      },
      {
        "id": 2366,
        "logprob": -0.4489746,
        "special": false,
        "text": "202"
      },
      {
        "id": 15,
        "logprob": -0.037109375,
        "special": false,
        "text": "0"
      },
      {
        "id": 2978,
        "logprob": -0.8100586,
        "special": false,
        "text": " school"
      },
      {
        "id": 1060,
        "logprob": -0.013015747,
        "special": false,
        "text": " year"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " for the 2019-2020 school year"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 25,
        "logprob": -0.88183594,
        "special": false,
        "text": ":"
      },
      {
        "id": 2209,
        "logprob": -2.6699219,
        "special": false,
        "text": " Is"
      },
      {
        "id": 279,
        "logprob": -0.61083984,
        "special": false,
        "text": " the"
      },
      {
        "id": 734,
        "logprob": -2.6660156,
        "special": false,
        "text": " function"
      },
      {
        "id": 330,
        "logprob": -0.35498047,
        "special": false,
        "text": " \""
      },
      {
        "id": 4110,
        "logprob": -2.4101562,
        "special": false,
        "text": "Create"
      },
      {
        "id": 7575,
        "logprob": -2.2304688,
        "special": false,
        "text": "Process"
      },
      {
        "id": 1,
        "logprob": -0.080078125,
        "special": false,
        "text": "\""
      },
      {
        "id": 304,
        "logprob": -0.75439453,
        "special": false,
        "text": " in"
      },
      {
        "id": 12468,
        "logprob": -1.8769531,
        "special": false,
        "text": " Win"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request: Is the function \"CreateProcess\" in Win"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 369,
          "logprob": -2.15625,
          "special": false,
          "text": " for"
        },
        {
          "id": 279,
          "logprob": -2.703125,
          "special": false,
          "text": " the"
        },
        {
          "id": 220,
          "logprob": -3.640625,
          "special": false,
          "text": " "
        },
        {
          "id": 679,
          "logprob": -1.703125,
          "special": false,
          "text": "201"
        },
        {
          "id": 24,
          "logprob": -1.421875,
          "special": false,
          "text": "9"
        },
        {
          "id": 12,
          "logprob": -2.03125,
          "special": false,
          "text": "-"
        },
        {
          "id": 2366,
          "logprob": -0.49023438,
          "special": false,
          "text": "202"
        },
        {
          "id": 15,
          "logprob": -0.041503906,
          "special": false,
          "text": "0"
        },
        {
          "id": 2978,
          "logprob": -0.87109375,
          "special": false,
          "text": " school"
        },
        {
          "id": 1060,
          "logprob": -0.012939453,
          "special": false,
          "text": " year"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for the 2019-2020 school year"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 369,
          "logprob": -2.15625,
          "special": false,
          "text": " for"
        },
        {
          "id": 279,
          "logprob": -2.703125,
          "special": false,
          "text": " the"
        },
        {
          "id": 220,
          "logprob": -3.640625,
          "special": false,
          "text": " "
        },
        {
          "id": 679,
          "logprob": -1.703125,
          "special": false,
          "text": "201"
        },
        {
          "id": 24,
          "logprob": -1.421875,
          "special": false,
          "text": "9"
        },
        {
          "id": 12,
          "logprob": -2.03125,
          "special": false,
          "text": "-"
        },
        {
          "id": 2366,
          "logprob": -0.49023438,
          "special": false,
          "text": "202"
        },
        {
          "id": 15,
          "logprob": -0.041503906,
          "special": false,
          "text": "0"
        },
        {
          "id": 2978,
          "logprob": -0.87109375,
          "special": false,
          "text": " school"
        },
        {
          "id": 1060,
          "logprob": -0.012939453,
          "special": false,
          "text": " year"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for the 2019-2020 school year"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 369,
          "logprob": -2.15625,
          "special": false,
          "text": " for"
        },
        {
          "id": 279,
          "logprob": -2.703125,
          "special": false,
          "text": " the"
        },
        {
          "id": 220,
          "logprob": -3.640625,
          "special": false,
          "text": " "
        },
        {
          "id": 679,
          "logprob": -1.703125,
          "special": false,
          "text": "201"
        },
        {
          "id": 24,
          "logprob": -1.421875,
          "special": false,
          "text": "9"
        },
        {
          "id": 12,
          "logprob": -2.03125,
          "special": false,
          "text": "-"
        },
        {
          "id": 2366,
          "logprob": -0.49023438,
          "special": false,
          "text": "202"
        },
        {
          "id": 15,
          "logprob": -0.041503906,
          "special": false,
          "text": "0"
        },
        {
          "id": 2978,
          "logprob": -0.87109375,
          "special": false,
          "text": " school"
        },
        {
          "id": 1060,
          "logprob": -0.012939453,
          "special": false,
          "text": " year"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for the 2019-2020 school year"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 369,
          "logprob": -2.15625,
          "special": false,
          "text": " for"
        },
        {
          "id": 279,
          "logprob": -2.703125,
          "special": false,
          "text": " the"
        },
        {
          "id": 220,
          "logprob": -3.640625,
          "special": false,
          "text": " "
        },
        {
          "id": 679,
          "logprob": -1.703125,
          "special": false,
          "text": "201"
        },
        {
          "id": 24,
          "logprob": -1.421875,
          "special": false,
          "text": "9"
        },
        {
          "id": 12,
          "logprob": -2.03125,
          "special": false,
          "text": "-"
        },
        {
          "id": 2366,
          "logprob": -0.49023438,
          "special": false,
          "text": "202"
        },
        {
          "id": 15,
          "logprob": -0.041503906,
          "special": false,
          "text": "0"
        },
        {
          "id": 2978,
          "logprob": -0.87109375,
          "special": false,
          "text": " school"
        },
        {
          "id": 1060,
          "logprob": -0.012939453,
          "special": false,
          "text": " year"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " for the 2019-2020 school year"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 18682,
        "logprob": -1.109375,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 6975,
        "logprob": -0.005432129,
        "special": false,
        "text": " learning"
      },
      {
        "id": 374,
        "logprob": -0.028808594,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.013671875,
        "special": false,
        "text": " a"
      },
      {
        "id": 27084,
        "logprob": -0.69921875,
        "special": false,
        "text": " subset"
      },
      {
        "id": 315,
        "logprob": -0.0005874634,
        "special": false,
        "text": " of"
      },
      {
        "id": 5780,
        "logprob": -0.026855469,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6975,
        "logprob": -0.00020885468,
        "special": false,
        "text": " learning"
      },
      {
        "id": 430,
        "logprob": -0.17773438,
        "special": false,
        "text": " that"
      },
      {
        "id": 18065,
        "logprob": -0.703125,
        "special": false,
        "text": " involves"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " Deep learning is a subset of machine learning that involves"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 720,
        "logprob": 0.0,
        "special": false,
        "text": " \n"
      },
      {
        "id": 34564,
        "logprob": -0.12512207,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6975,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      },
      {
        "id": 320,
        "logprob": -0.23840332,
        "special": false,
        "text": " ("
      },
      {
        "id": 16931,
        "logprob": -2.0175781,
        "special": false,
        "text": "DL"
      },
      {
        "id": 8,
        "logprob": 0.0,
        "special": false,
        "text": ")"
      },
      {
        "id": 374,
        "logprob": -0.8613281,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 1207,
        "logprob": -1.2451172,
        "special": false,
        "text": " sub"
      },
      {
        "id": 2630,
        "logprob": 0.0,
        "special": false,
        "text": "field"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning? \nDeep learning (DL) is a subfield"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -1.109375,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.0047912598,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.025512695,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.012145996,
          "special": false,
          "text": " a"
        },
        {
          "id": 27084,
          "logprob": -0.72265625,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.0005760193,
          "special": false,
          "text": " of"
        },
        {
          "id": 5780,
          "logprob": -0.02722168,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.00023651123,
          "special": false,
          "text": " learning"
        },
        {
          "id": 430,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        },
        {
          "id": 18065,
          "logprob": -0.703125,
          "special": false,
          "text": " involves"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a subset of machine learning that involves"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -1.1796875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.005432129,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.02758789,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.013366699,
          "special": false,
          "text": " a"
        },
        {
          "id": 27084,
          "logprob": -0.6953125,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.0004863739,
          "special": false,
          "text": " of"
        },
        {
          "id": 5780,
          "logprob": -0.02709961,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.00022506714,
          "special": false,
          "text": " learning"
        },
        {
          "id": 430,
          "logprob": -0.19726562,
          "special": false,
          "text": " that"
        },
        {
          "id": 18065,
          "logprob": -0.77734375,
          "special": false,
          "text": " involves"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a subset of machine learning that involves"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -1.1796875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.005432129,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.02758789,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.013366699,
          "special": false,
          "text": " a"
        },
        {
          "id": 27084,
          "logprob": -0.6953125,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.0004863739,
          "special": false,
          "text": " of"
        },
        {
          "id": 5780,
          "logprob": -0.02709961,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.00022506714,
          "special": false,
          "text": " learning"
        },
        {
          "id": 430,
          "logprob": -0.19726562,
          "special": false,
          "text": " that"
        },
        {
          "id": 18065,
          "logprob": -0.77734375,
          "special": false,
          "text": " involves"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a subset of machine learning that involves"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -1.1796875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.005432129,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.02758789,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.013366699,
          "special": false,
          "text": " a"
        },
        {
          "id": 27084,
          "logprob": -0.6953125,
          "special": false,
          "text": " subset"
        },
        {
          "id": 315,
          "logprob": -0.0004863739,
          "special": false,
          "text": " of"
        },
        {
          "id": 5780,
          "logprob": -0.02709961,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6975,
          "logprob": -0.00022506714,
          "special": false,
          "text": " learning"
        },
        {
          "id": 430,
          "logprob": -0.19726562,
          "special": false,
          "text": " that"
        },
        {
          "id": 18065,
          "logprob": -0.77734375,
          "special": false,
          "text": " involves"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a subset of machine learning that involves"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 198,
        "logprob": -2.5742188,
        "special": false,
        "text": "\n"
      },
      {
        "id": 262,
        "logprob": -1.6230469,
        "special": false,
        "text": "   "
      },
      {
        "id": 3270,
        "logprob": -2.046875,
        "special": false,
        "text": " \"\"\"\n"
      },
      {
        "id": 262,
        "logprob": -0.015281677,
        "special": false,
        "text": "   "
      },
      {
        "id": 422,
        "logprob": -2.1425781,
        "special": false,
        "text": " if"
      },
      {
        "id": 1715,
        "logprob": -0.9238281,
        "special": false,
        "text": " request"
      },
      {
        "id": 13204,
        "logprob": -0.076660156,
        "special": false,
        "text": ".method"
      },
      {
        "id": 624,
        "logprob": -0.021987915,
        "special": false,
        "text": " =="
      },
      {
        "id": 364,
        "logprob": -0.39208984,
        "special": false,
        "text": " '"
      },
      {
        "id": 3019,
        "logprob": -0.10821533,
        "special": false,
        "text": "POST"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 13,
        "logprob": -2.2539062,
        "special": false,
        "text": "."
      },
      {
        "id": 578,
        "logprob": -0.15563965,
        "special": false,
        "text": " The"
      },
      {
        "id": 3622,
        "logprob": -0.8203125,
        "special": false,
        "text": " server"
      },
      {
        "id": 706,
        "logprob": 0.0,
        "special": false,
        "text": " has"
      },
      {
        "id": 539,
        "logprob": 0.0,
        "special": false,
        "text": " not"
      },
      {
        "id": 3686,
        "logprob": 0.0,
        "special": false,
        "text": " yet"
      },
      {
        "id": 3288,
        "logprob": 0.0,
        "special": false,
        "text": " sent"
      },
      {
        "id": 904,
        "logprob": 0.0,
        "special": false,
        "text": " any"
      },
      {
        "id": 828,
        "logprob": 0.0,
        "special": false,
        "text": " data"
      },
      {
        "id": 382,
        "logprob": -1.5517578,
        "special": false,
        "text": ".\n\n"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request. The server has not yet sent any data.\n\n"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -2.5742188,
          "special": false,
          "text": "\n"
        },
        {
          "id": 262,
          "logprob": -1.6220703,
          "special": false,
          "text": "   "
        },
        {
          "id": 3270,
          "logprob": -2.0410156,
          "special": false,
          "text": " \"\"\"\n"
        },
        {
          "id": 262,
          "logprob": -0.015281677,
          "special": false,
          "text": "   "
        },
        {
          "id": 422,
          "logprob": -2.1445312,
          "special": false,
          "text": " if"
        },
        {
          "id": 1715,
          "logprob": -0.92333984,
          "special": false,
          "text": " request"
        },
        {
          "id": 13204,
          "logprob": -0.07672119,
          "special": false,
          "text": ".method"
        },
        {
          "id": 624,
          "logprob": -0.021987915,
          "special": false,
          "text": " =="
        },
        {
          "id": 364,
          "logprob": -0.39208984,
          "special": false,
          "text": " '"
        },
        {
          "id": 3019,
          "logprob": -0.10638428,
          "special": false,
          "text": "POST"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -2.5742188,
          "special": false,
          "text": "\n"
        },
        {
          "id": 262,
          "logprob": -1.6220703,
          "special": false,
          "text": "   "
        },
        {
          "id": 3270,
          "logprob": -2.0410156,
          "special": false,
          "text": " \"\"\"\n"
        },
        {
          "id": 262,
          "logprob": -0.015281677,
          "special": false,
          "text": "   "
        },
        {
          "id": 422,
          "logprob": -2.1445312,
          "special": false,
          "text": " if"
        },
        {
          "id": 1715,
          "logprob": -0.92333984,
          "special": false,
          "text": " request"
        },
        {
          "id": 13204,
          "logprob": -0.07672119,
          "special": false,
          "text": ".method"
        },
        {
          "id": 624,
          "logprob": -0.021987915,
          "special": false,
          "text": " =="
        },
        {
          "id": 364,
          "logprob": -0.39208984,
          "special": false,
          "text": " '"
        },
        {
          "id": 3019,
          "logprob": -0.10638428,
          "special": false,
          "text": "POST"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -2.5742188,
          "special": false,
          "text": "\n"
        },
        {
          "id": 262,
          "logprob": -1.6220703,
          "special": false,
          "text": "   "
        },
        {
          "id": 3270,
          "logprob": -2.0410156,
          "special": false,
          "text": " \"\"\"\n"
        },
        {
          "id": 262,
          "logprob": -0.015281677,
          "special": false,
          "text": "   "
        },
        {
          "id": 422,
          "logprob": -2.1445312,
          "special": false,
          "text": " if"
        },
        {
          "id": 1715,
          "logprob": -0.92333984,
          "special": false,
          "text": " request"
        },
        {
          "id": 13204,
          "logprob": -0.07672119,
          "special": false,
          "text": ".method"
        },
        {
          "id": 624,
          "logprob": -0.021987915,
          "special": false,
          "text": " =="
        },
        {
          "id": 364,
          "logprob": -0.39208984,
          "special": false,
          "text": " '"
        },
        {
          "id": 3019,
          "logprob": -0.10638428,
          "special": false,
          "text": "POST"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -2.5742188,
          "special": false,
          "text": "\n"
        },
        {
          "id": 262,
          "logprob": -1.6220703,
          "special": false,
          "text": "   "
        },
        {
          "id": 3270,
          "logprob": -2.0410156,
          "special": false,
          "text": " \"\"\"\n"
        },
        {
          "id": 262,
          "logprob": -0.015281677,
          "special": false,
          "text": "   "
        },
        {
          "id": 422,
          "logprob": -2.1445312,
          "special": false,
          "text": " if"
        },
        {
          "id": 1715,
          "logprob": -0.92333984,
          "special": false,
          "text": " request"
        },
        {
          "id": 13204,
          "logprob": -0.07672119,
          "special": false,
          "text": ".method"
        },
        {
          "id": 624,
          "logprob": -0.021987915,
          "special": false,
          "text": " =="
        },
        {
          "id": 364,
          "logprob": -0.39208984,
          "special": false,
          "text": " '"
        },
        {
          "id": 3019,
          "logprob": -0.10638428,
          "special": false,
          "text": "POST"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -2.0507812,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -2.3007812,
        "special": false,
        "text": "\n"
      },
      {
        "id": 29902,
        "logprob": -2.0449219,
        "special": false,
        "text": "I"
      },
      {
        "id": 505,
        "logprob": -1.3242188,
        "special": false,
        "text": " have"
      },
      {
        "id": 263,
        "logprob": -0.2076416,
        "special": false,
        "text": " a"
      },
      {
        "id": 1243,
        "logprob": -2.0273438,
        "special": false,
        "text": " test"
      },
      {
        "id": 2009,
        "logprob": -0.6845703,
        "special": false,
        "text": " request"
      },
      {
        "id": 515,
        "logprob": -1.1748047,
        "special": false,
        "text": " from"
      },
      {
        "id": 263,
        "logprob": -1.0644531,
        "special": false,
        "text": " a"
      },
      {
        "id": 1404,
        "logprob": -1.5224609,
        "special": false,
        "text": " user"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nI have a test request from a user"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 5229,
        "logprob": -1.2607422,
        "special": false,
        "text": " failed"
      },
      {
        "id": 29901,
        "logprob": 0.0,
        "special": false,
        "text": ":"
      },
      {
        "id": 6527,
        "logprob": -0.11450195,
        "special": false,
        "text": " Could"
      },
      {
        "id": 451,
        "logprob": 0.0,
        "special": false,
        "text": " not"
      },
      {
        "id": 4511,
        "logprob": -0.2286377,
        "special": false,
        "text": " connect"
      },
      {
        "id": 304,
        "logprob": 0.0,
        "special": false,
        "text": " to"
      },
      {
        "id": 1923,
        "logprob": -1.2568359,
        "special": false,
        "text": " server"
      },
      {
        "id": 13,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.15905762,
        "special": false,
        "text": "\n"
      },
      {
        "id": 29902,
        "logprob": -0.21618652,
        "special": false,
        "text": "I"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request failed: Could not connect to server\n\nI"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -2.0507812,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -2.3007812,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29902,
          "logprob": -2.0449219,
          "special": false,
          "text": "I"
        },
        {
          "id": 505,
          "logprob": -1.3242188,
          "special": false,
          "text": " have"
        },
        {
          "id": 263,
          "logprob": -0.2076416,
          "special": false,
          "text": " a"
        },
        {
          "id": 1243,
          "logprob": -2.0273438,
          "special": false,
          "text": " test"
        },
        {
          "id": 2009,
          "logprob": -0.6845703,
          "special": false,
          "text": " request"
        },
        {
          "id": 515,
          "logprob": -1.1748047,
          "special": false,
          "text": " from"
        },
        {
          "id": 263,
          "logprob": -1.0595703,
          "special": false,
          "text": " a"
        },
        {
          "id": 1404,
          "logprob": -1.5224609,
          "special": false,
          "text": " user"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nI have a test request from a user"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -2.0507812,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -2.3007812,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29902,
          "logprob": -2.0449219,
          "special": false,
          "text": "I"
        },
        {
          "id": 505,
          "logprob": -1.3242188,
          "special": false,
          "text": " have"
        },
        {
          "id": 263,
          "logprob": -0.2076416,
          "special": false,
          "text": " a"
        },
        {
          "id": 1243,
          "logprob": -2.0273438,
          "special": false,
          "text": " test"
        },
        {
          "id": 2009,
          "logprob": -0.6845703,
          "special": false,
          "text": " request"
        },
        {
          "id": 515,
          "logprob": -1.1748047,
          "special": false,
          "text": " from"
        },
        {
          "id": 263,
          "logprob": -1.0595703,
          "special": false,
          "text": " a"
        },
        {
          "id": 1404,
          "logprob": -1.5224609,
          "special": false,
          "text": " user"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nI have a test request from a user"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -2.0507812,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -2.3007812,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29902,
          "logprob": -2.0449219,
          "special": false,
          "text": "I"
        },
        {
          "id": 505,
          "logprob": -1.3242188,
          "special": false,
          "text": " have"
        },
        {
          "id": 263,
          "logprob": -0.2076416,
          "special": false,
          "text": " a"
        },
        {
          "id": 1243,
          "logprob": -2.0273438,
          "special": false,
          "text": " test"
        },
        {
          "id": 2009,
          "logprob": -0.6845703,
          "special": false,
          "text": " request"
        },
        {
          "id": 515,
          "logprob": -1.1748047,
          "special": false,
          "text": " from"
        },
        {
          "id": 263,
          "logprob": -1.0595703,
          "special": false,
          "text": " a"
        },
        {
          "id": 1404,
          "logprob": -1.5224609,
          "special": false,
          "text": " user"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nI have a test request from a user"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -2.0507812,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -2.3007812,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29902,
          "logprob": -2.0449219,
          "special": false,
          "text": "I"
        },
        {
          "id": 505,
          "logprob": -1.3242188,
          "special": false,
          "text": " have"
        },
        {
          "id": 263,
          "logprob": -0.2076416,
          "special": false,
          "text": " a"
        },
        {
          "id": 1243,
          "logprob": -2.0273438,
          "special": false,
          "text": " test"
        },
        {
          "id": 2009,
          "logprob": -0.6845703,
          "special": false,
          "text": " request"
        },
        {
          "id": 515,
          "logprob": -1.1748047,
          "special": false,
          "text": " from"
        },
        {
          "id": 263,
          "logprob": -1.0595703,
          "special": false,
          "text": " a"
        },
        {
          "id": 1404,
          "logprob": -1.5224609,
          "special": false,
          "text": " user"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nI have a test request from a user"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 5229,
        "logprob": -2.7988281,
        "special": false,
        "text": " failed"
      },
      {
        "id": 29901,
        "logprob": -0.91259766,
        "special": false,
        "text": ":"
      },
      {
        "id": 853,
        "logprob": -2.8496094,
        "special": false,
        "text": " Un"
      },
      {
        "id": 23765,
        "logprob": -1.1894531,
        "special": false,
        "text": "supported"
      },
      {
        "id": 4714,
        "logprob": -1.5917969,
        "special": false,
        "text": " browser"
      },
      {
        "id": 29892,
        "logprob": -0.34765625,
        "special": false,
        "text": ","
      },
      {
        "id": 1873,
        "logprob": -1.2695312,
        "special": false,
        "text": " version"
      },
      {
        "id": 470,
        "logprob": -0.25170898,
        "special": false,
        "text": " or"
      },
      {
        "id": 7481,
        "logprob": -0.21411133,
        "special": false,
        "text": " platform"
      },
      {
        "id": 13,
        "logprob": -1.1162109,
        "special": false,
        "text": "\n"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " failed: Unsupported browser, version or platform\n"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin24_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 5229,
        "logprob": -0.6645508,
        "special": false,
        "text": " failed"
      },
      {
        "id": 29901,
        "logprob": 0.0,
        "special": false,
        "text": ":"
      },
      {
        "id": 6527,
        "logprob": -2.2324219,
        "special": false,
        "text": " Could"
      },
      {
        "id": 451,
        "logprob": 0.0,
        "special": false,
        "text": " not"
      },
      {
        "id": 6088,
        "logprob": -1.6074219,
        "special": false,
        "text": " parse"
      },
      {
        "id": 1243,
        "logprob": -1.6298828,
        "special": false,
        "text": " test"
      },
      {
        "id": 1206,
        "logprob": -0.72558594,
        "special": false,
        "text": " case"
      },
      {
        "id": 1024,
        "logprob": -0.40429688,
        "special": false,
        "text": " name"
      },
      {
        "id": 515,
        "logprob": 0.0,
        "special": false,
        "text": " from"
      },
      {
        "id": 525,
        "logprob": -1.2519531,
        "special": false,
        "text": " '"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request failed: Could not parse test case name from '"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin24_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 5229,
          "logprob": -2.7988281,
          "special": false,
          "text": " failed"
        },
        {
          "id": 29901,
          "logprob": -0.91259766,
          "special": false,
          "text": ":"
        },
        {
          "id": 853,
          "logprob": -2.8496094,
          "special": false,
          "text": " Un"
        },
        {
          "id": 23765,
          "logprob": -1.1894531,
          "special": false,
          "text": "supported"
        },
        {
          "id": 4714,
          "logprob": -1.5917969,
          "special": false,
          "text": " browser"
        },
        {
          "id": 29892,
          "logprob": -0.34765625,
          "special": false,
          "text": ","
        },
        {
          "id": 1873,
          "logprob": -1.2695312,
          "special": false,
          "text": " version"
        },
        {
          "id": 470,
          "logprob": -0.25170898,
          "special": false,
          "text": " or"
        },
        {
          "id": 7481,
          "logprob": -0.21411133,
          "special": false,
          "text": " platform"
        },
        {
          "id": 13,
          "logprob": -1.1162109,
          "special": false,
          "text": "\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " failed: Unsupported browser, version or platform\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 5229,
          "logprob": -2.7988281,
          "special": false,
          "text": " failed"
        },
        {
          "id": 29901,
          "logprob": -0.91259766,
          "special": false,
          "text": ":"
        },
        {
          "id": 853,
          "logprob": -2.8496094,
          "special": false,
          "text": " Un"
        },
        {
          "id": 23765,
          "logprob": -1.1894531,
          "special": false,
          "text": "supported"
        },
        {
          "id": 4714,
          "logprob": -1.5917969,
          "special": false,
          "text": " browser"
        },
        {
          "id": 29892,
          "logprob": -0.34765625,
          "special": false,
          "text": ","
        },
        {
          "id": 1873,
          "logprob": -1.2695312,
          "special": false,
          "text": " version"
        },
        {
          "id": 470,
          "logprob": -0.25170898,
          "special": false,
          "text": " or"
        },
        {
          "id": 7481,
          "logprob": -0.21411133,
          "special": false,
          "text": " platform"
        },
        {
          "id": 13,
          "logprob": -1.1162109,
          "special": false,
          "text": "\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " failed: Unsupported browser, version or platform\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 5229,
          "logprob": -2.7988281,
          "special": false,
          "text": " failed"
        },
        {
          "id": 29901,
          "logprob": -0.91259766,
          "special": false,
          "text": ":"
        },
        {
          "id": 853,
          "logprob": -2.8496094,
          "special": false,
          "text": " Un"
        },
        {
          "id": 23765,
          "logprob": -1.1894531,
          "special": false,
          "text": "supported"
        },
        {
          "id": 4714,
          "logprob": -1.5917969,
          "special": false,
          "text": " browser"
        },
        {
          "id": 29892,
          "logprob": -0.34765625,
          "special": false,
          "text": ","
        },
        {
          "id": 1873,
          "logprob": -1.2695312,
          "special": false,
          "text": " version"
        },
        {
          "id": 470,
          "logprob": -0.25170898,
          "special": false,
          "text": " or"
        },
        {
          "id": 7481,
          "logprob": -0.21411133,
          "special": false,
          "text": " platform"
        },
        {
          "id": 13,
          "logprob": -1.1162109,
          "special": false,
          "text": "\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " failed: Unsupported browser, version or platform\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 5229,
          "logprob": -2.7988281,
          "special": false,
          "text": " failed"
        },
        {
          "id": 29901,
          "logprob": -0.91259766,
          "special": false,
          "text": ":"
        },
        {
          "id": 853,
          "logprob": -2.8496094,
          "special": false,
          "text": " Un"
        },
        {
          "id": 23765,
          "logprob": -1.1894531,
          "special": false,
          "text": "supported"
        },
        {
          "id": 4714,
          "logprob": -1.5917969,
          "special": false,
          "text": " browser"
        },
        {
          "id": 29892,
          "logprob": -0.34765625,
          "special": false,
          "text": ","
        },
        {
          "id": 1873,
          "logprob": -1.2695312,
          "special": false,
          "text": " version"
        },
        {
          "id": 470,
          "logprob": -0.25170898,
          "special": false,
          "text": " or"
        },
        {
          "id": 7481,
          "logprob": -0.21411133,
          "special": false,
          "text": " platform"
        },
        {
          "id": 13,
          "logprob": -1.1162109,
          "special": false,
          "text": "\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " failed: Unsupported browser, version or platform\n"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_prefix/test_flash_llama_load.json
================================================
[
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Jeff Walker's Product Launch Formula is a comprehensive system",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 69,
      "total_tokens": 79
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are three key indicators to determine if a customer",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 52,
      "total_tokens": 62
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You can use the `String.format()` method in",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 97,
      "total_tokens": 107
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "In a realm of binary mysticism, we find",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 126,
      "total_tokens": 136
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The `dummy` variable is being used to consume",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 305,
      "total_tokens": 315
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You can add multiple new columns in Power Query (",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 51,
      "total_tokens": 61
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "There are many exciting new technologies emerging across various fields",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 52,
      "total_tokens": 62
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Poly Ether Ether Ketone (PEEK) is",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 40,
      "total_tokens": 50
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a technical overview of a referral system similar",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 85,
      "total_tokens": 95
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's an example of how you can add an",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 45,
      "total_tokens": 55
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to help with Java. What",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 43,
      "total_tokens": 53
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I can help you plan a road trip from Pune",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 82,
      "total_tokens": 92
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to explain more about a topic",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 38,
      "total_tokens": 48
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to help you brainstorm and provide",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 47,
      "total_tokens": 57
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Implementing a Minesweeper algorithm using algebraic",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 54,
      "total_tokens": 64
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "There are several issues with the provided code:\n\n1",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 375,
      "total_tokens": 385
    }
  },
  {
    "choices": [
      {
        "finish_reason": "stop",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": ";)",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085330,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 2,
      "prompt_tokens": 105,
      "total_tokens": 107
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "As I delved into the world of high-st",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2097,
      "total_tokens": 2107
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "/u/CruxHub: Hi, I'm",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2614,
      "total_tokens": 2624
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To simulate a conversation between Alice and /u/C",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1070,
      "total_tokens": 1080
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Alice: Hey /u/CruxHub,",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1847,
      "total_tokens": 1857
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Alice: Hi /u/CruxHub,",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1849,
      "total_tokens": 1859
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "/u/CruxHub: Hey Alice, I",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1004,
      "total_tokens": 1014
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "/u/CruxHub: Hey Alice, I",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1100,
      "total_tokens": 1110
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "/u/CruxHub: Hey Alice, I",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1044,
      "total_tokens": 1054
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The Dogme approach and the Lexical Approach are",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 54,
      "total_tokens": 64
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Implementing a netfilter in Linux with a Rust",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 48,
      "total_tokens": 58
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Damage to the Ulnar nerve can cause numb",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 56,
      "total_tokens": 66
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The Space Shuttle's Reaction Control System (RCS",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
      "total_tokens": 60
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I can provide you with a basic Python script that",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 65,
      "total_tokens": 75
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Farming meat has several negative impacts on the environment",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 43,
      "total_tokens": 53
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The photograph filter you're referring to is called \"",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 51,
      "total_tokens": 61
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a sample geological database structure with some example",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 59,
      "total_tokens": 69
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Web Marketing: A Simplified Explanation**\n\nWeb",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 45,
      "total_tokens": 55
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a rewritten and improved version of the story",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 447,
      "total_tokens": 457
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are the questions rewritten in a more conversational",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 168,
      "total_tokens": 178
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Learning Progress: 0%**\n\n| Topic",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 216,
      "total_tokens": 226
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I couldn't find any information on a person named",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 44,
      "total_tokens": 54
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a list of the largest outdoor retailers in",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 43,
      "total_tokens": 53
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To create a WordPress shortcode that includes Facebook SDK code",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 49,
      "total_tokens": 59
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The sentence is mostly grammatically correct, but there",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 78,
      "total_tokens": 88
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to engage in a debate with",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 59,
      "total_tokens": 69
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd love to hear about your business. As",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 64,
      "total_tokens": 74
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'll wait for your request to proceed with part",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2410,
      "total_tokens": 2420
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The final part of the Day Sculpting program emphasizes",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2699,
      "total_tokens": 2709
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Analysis of the Coming of Age Story Archetype",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 349,
      "total_tokens": 359
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The Apostle John is one of the most prominent figures",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 49,
      "total_tokens": 59
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To build a Google Places autocomplete feature on Jetpack",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 427,
      "total_tokens": 437
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The information provided does not mention the captain's name",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 169,
      "total_tokens": 179
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The metaverse is a shared, immersive and interactive",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 39,
      "total_tokens": 49
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some ideas for a series of articles for",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
      "total_tokens": 60
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "\"Purim Palooza Alert: \n\nTo",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 78,
      "total_tokens": 88
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Summary of the paper in 10 points:",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2022,
      "total_tokens": 2032
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You'll provide three pieces of text, and then",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 58,
      "total_tokens": 68
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'm ready to proceed with text 3.",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1650,
      "total_tokens": 1660
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'm ready to answer questions on Text 1",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1116,
      "total_tokens": 1126
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "This is a Solidity contract written in the older",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 334,
      "total_tokens": 344
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Speech Recognition and Synthesis using Python**\n\nTo",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 84,
      "total_tokens": 94
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to help you discuss a paper",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 42,
      "total_tokens": 52
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To handle the given utterance, we can use",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 375,
      "total_tokens": 385
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Subscription Services Template:**\n\n**Title:** Virtual",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 443,
      "total_tokens": 453
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Hello. How can I assist you today?",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 36,
      "total_tokens": 46
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Differentiating yourself from other Etsy shops is crucial to",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 102,
      "total_tokens": 112
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To become a Licensed Marriage and Family Therapist (",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 53,
      "total_tokens": 63
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**What is Quantum Computing?**\n\nQuantum computing",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 42,
      "total_tokens": 52
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Aquí te dejo 40 opciones de nombres",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 108,
      "total_tokens": 118
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Deposition is a geological process that involves the transportation",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 38,
      "total_tokens": 48
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some good e-governance initiatives in",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 55,
      "total_tokens": 65
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a simple Python program that accepts a command",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 56,
      "total_tokens": 66
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Imagine you're playing with a toy box. You",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 47,
      "total_tokens": 57
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's an example of a question they might ask",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 66,
      "total_tokens": 76
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Arduino Uno adalah sebuah papan mikrokontrol",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 38,
      "total_tokens": 48
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To edit an array that is within an object,",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 42,
      "total_tokens": 52
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Microsoft ENTRA (Enterprise Mobility + Security) is",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 56,
      "total_tokens": 66
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To calculate the difference in interest paid between a simple",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 69,
      "total_tokens": 79
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Yes, you can use Spring State Machine and Spring",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 49,
      "total_tokens": 59
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The issue lies in the fact that the `meta",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 142,
      "total_tokens": 152
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some effective marketing tactics for local small businesses",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 46,
      "total_tokens": 56
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The French Revolution, which lasted from 1789",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 41,
      "total_tokens": 51
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Roles of a Network Driver:**\n\nA network",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 65,
      "total_tokens": 75
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Yes, I'm familiar with the SAS (Stat",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 44,
      "total_tokens": 54
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Using relays to control 12V solen",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 60,
      "total_tokens": 70
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You can use the following Python code to achieve this",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 55,
      "total_tokens": 65
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some prompts for viral comics:\n\n1.",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 336,
      "total_tokens": 346
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To simplify and make the comic funnier, consider",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 301,
      "total_tokens": 311
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a rewritten version of the 4-panel",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 282,
      "total_tokens": 292
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Subject: Request for E-Waste Collection and Computer",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 110,
      "total_tokens": 120
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "In the context of conference calls, the state you",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 84,
      "total_tokens": 94
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I can provide a general classification of companies based on",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 56,
      "total_tokens": 66
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some user stories that describe the concept in",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 44,
      "total_tokens": 54
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You can check your Python version by running the following",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 39,
      "total_tokens": 49
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Scenario:**\n\n15-year-old Black youth,",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 473,
      "total_tokens": 483
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "As a Demand Generation Manager for a B2B",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
      "total_tokens": 60
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The error is due to a typo in your code",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085336,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 369,
      "total_tokens": 379
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "고등교육의 필요성에 관한 영어 에",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 72,
      "total_tokens": 82
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a simple C# program that uses the",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 51,
      "total_tokens": 61
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The error message \"connection refused\" indicates that the",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085331,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 85,
      "total_tokens": 95
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To load an image, you can use various methods",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726085326,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 41,
      "total_tokens": 51
    }
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_llama_prefix_flashdecoding/test_flash_llama_flashdecoding.json
================================================
[
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Jeff Walker's Product Launch Formula is a comprehensive system",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 69,
      "total_tokens": 79
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are three key indicators to determine if a customer",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 52,
      "total_tokens": 62
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You can use the `String.format()` method in",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 97,
      "total_tokens": 107
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "In a realm of binary mysticism, we find",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 126,
      "total_tokens": 136
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The `dummy` variable is being used to consume",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 305,
      "total_tokens": 315
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You can add multiple new columns in Power Query (",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 51,
      "total_tokens": 61
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "There are many exciting new technologies emerging across various fields",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 52,
      "total_tokens": 62
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Poly Ether Ether Ketone (PEEK) is",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 40,
      "total_tokens": 50
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a technical overview of a referral system similar",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 85,
      "total_tokens": 95
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's an example of how you can add an",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 45,
      "total_tokens": 55
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to help with Java. What",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 43,
      "total_tokens": 53
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I can help you plan a road trip from Pune",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 82,
      "total_tokens": 92
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to explain more about a topic",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 38,
      "total_tokens": 48
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to help you brainstorm and provide",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 47,
      "total_tokens": 57
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Implementing a Minesweeper algorithm using algebraic",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 54,
      "total_tokens": 64
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "There are several issues with the provided code:\n\n1",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 375,
      "total_tokens": 385
    }
  },
  {
    "choices": [
      {
        "finish_reason": "stop",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": ";)",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 2,
      "prompt_tokens": 105,
      "total_tokens": 107
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "As I delved into the world of high-st",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2097,
      "total_tokens": 2107
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "/u/CruxHub: Hi, I'm",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2614,
      "total_tokens": 2624
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To simulate a conversation between Alice and /u/C",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1070,
      "total_tokens": 1080
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Alice: Hey /u/CruxHub,",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1847,
      "total_tokens": 1857
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Alice: Hi /u/CruxHub,",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1849,
      "total_tokens": 1859
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "/u/CruxHub: Hey Alice, I",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1004,
      "total_tokens": 1014
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "/u/CruxHub: Hey Alice, I",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1100,
      "total_tokens": 1110
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "/u/CruxHub: Hey Alice, I",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1044,
      "total_tokens": 1054
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The Dogme approach and the Lexical Approach are",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 54,
      "total_tokens": 64
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Implementing a netfilter in Linux with a Rust",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 48,
      "total_tokens": 58
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Damage to the Ulnar nerve can cause numb",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 56,
      "total_tokens": 66
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The Space Shuttle's Reaction Control System (RCS",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
      "total_tokens": 60
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I can provide you with a basic Python script that",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 65,
      "total_tokens": 75
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Farming meat has several negative impacts on the environment",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 43,
      "total_tokens": 53
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The photograph filter you're referring to is called \"",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 51,
      "total_tokens": 61
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a sample geological database structure with some example",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 59,
      "total_tokens": 69
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Web Marketing: A Simplified Explanation**\n\nWeb",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 45,
      "total_tokens": 55
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a rewritten and improved version of the story",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 447,
      "total_tokens": 457
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are the questions rewritten in a more conversational",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 168,
      "total_tokens": 178
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Learning Progress: 0%**\n\n| Topic",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 216,
      "total_tokens": 226
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I couldn't find any information on a person named",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 44,
      "total_tokens": 54
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a list of the largest outdoor retailers in",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 43,
      "total_tokens": 53
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To create a WordPress shortcode that includes Facebook SDK code",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 49,
      "total_tokens": 59
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The sentence is mostly grammatically correct, but there",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 78,
      "total_tokens": 88
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to engage in a debate with",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 59,
      "total_tokens": 69
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd love to hear about your business. As",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 64,
      "total_tokens": 74
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'll wait for your request to proceed with part",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2410,
      "total_tokens": 2420
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The final part of the Day Sculpting program emphasizes",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2699,
      "total_tokens": 2709
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Analysis of the Coming of Age Story Archetype",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 349,
      "total_tokens": 359
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The Apostle John is one of the most prominent figures",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 49,
      "total_tokens": 59
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To build a Google Places autocomplete feature on Jetpack",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 427,
      "total_tokens": 437
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The information provided does not mention the captain's name",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 169,
      "total_tokens": 179
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The metaverse is a shared, immersive and interactive",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 39,
      "total_tokens": 49
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some ideas for a series of articles for",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
      "total_tokens": 60
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "\"Purim Palooza Alert: \n\nTo",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 78,
      "total_tokens": 88
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Summary of the paper in 10 points:",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 2022,
      "total_tokens": 2032
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You'll provide three pieces of text, and then",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 58,
      "total_tokens": 68
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'm ready to proceed with text 3.",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1650,
      "total_tokens": 1660
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'm ready to answer questions on Text 1",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 1116,
      "total_tokens": 1126
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "This is a Solidity contract written in the older",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 334,
      "total_tokens": 344
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Speech Recognition and Synthesis using Python**\n\nTo",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 84,
      "total_tokens": 94
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I'd be happy to help you discuss a paper",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 42,
      "total_tokens": 52
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To handle the given utterance, we can use",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 375,
      "total_tokens": 385
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Subscription Services Template:**\n\n**Title:** Virtual",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 443,
      "total_tokens": 453
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Hello. How can I assist you today?",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 36,
      "total_tokens": 46
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Differentiating yourself from other Etsy shops is crucial to",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 102,
      "total_tokens": 112
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To become a Licensed Marriage and Family Therapist (",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 53,
      "total_tokens": 63
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**What is Quantum Computing?**\n\nQuantum computing",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 42,
      "total_tokens": 52
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Aquí te dejo 40 opciones de nombres",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 108,
      "total_tokens": 118
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Deposition is a geological process that involves the transportation",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 38,
      "total_tokens": 48
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some good e-governance initiatives in",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 55,
      "total_tokens": 65
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a simple Python program that accepts a command",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 56,
      "total_tokens": 66
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Imagine you're playing with a toy box. You",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 47,
      "total_tokens": 57
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's an example of a question they might ask",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 66,
      "total_tokens": 76
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Arduino Uno adalah sebuah papan mikrokontrol",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 38,
      "total_tokens": 48
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To edit an array that is within an object,",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 42,
      "total_tokens": 52
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Microsoft ENTRA (Enterprise Mobility + Security) is",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 56,
      "total_tokens": 66
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To calculate the difference in interest paid between a simple",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 69,
      "total_tokens": 79
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Yes, you can use Spring State Machine and Spring",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 49,
      "total_tokens": 59
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The issue lies in the fact that the `meta",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 142,
      "total_tokens": 152
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some effective marketing tactics for local small businesses",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 46,
      "total_tokens": 56
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The French Revolution, which lasted from 1789",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 41,
      "total_tokens": 51
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Roles of a Network Driver:**\n\nA network",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 65,
      "total_tokens": 75
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Yes, I'm familiar with the SAS (Stat",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 44,
      "total_tokens": 54
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Using relays to control 12V solen",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 60,
      "total_tokens": 70
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You can use the following Python code to achieve this",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 55,
      "total_tokens": 65
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some prompts for viral comics:\n\n1.",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 336,
      "total_tokens": 346
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To simplify and make the comic funnier, consider",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 301,
      "total_tokens": 311
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a rewritten version of the 4-panel",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243278,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 282,
      "total_tokens": 292
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Subject: Request for E-Waste Collection and Computer",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 110,
      "total_tokens": 120
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "In the context of conference calls, the state you",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 84,
      "total_tokens": 94
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "I can provide a general classification of companies based on",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 56,
      "total_tokens": 66
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here are some user stories that describe the concept in",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 44,
      "total_tokens": 54
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "You can check your Python version by running the following",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 39,
      "total_tokens": 49
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "**Scenario:**\n\n15-year-old Black youth,",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 473,
      "total_tokens": 483
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "As a Demand Generation Manager for a B2B",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 50,
      "total_tokens": 60
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The error is due to a typo in your code",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 369,
      "total_tokens": 379
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "고등교육의 필요성에 관한 영어 에",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243286,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 72,
      "total_tokens": 82
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "Here's a simple C# program that uses the",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243283,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 51,
      "total_tokens": 61
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "The error message \"connection refused\" indicates that the",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243277,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 85,
      "total_tokens": 95
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "To load an image, you can use various methods",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1726243284,
    "id": "",
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "2.2.1-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 41,
      "total_tokens": 51
    }
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 13,
        "logprob": -1.1582031,
        "special": false,
        "text": "\n"
      },
      {
        "id": 2772,
        "logprob": -0.23083496,
        "special": false,
        "text": "De"
      },
      {
        "id": 1022,
        "logprob": 0.0,
        "special": false,
        "text": "ep"
      },
      {
        "id": 6509,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      },
      {
        "id": 29892,
        "logprob": -0.61816406,
        "special": false,
        "text": ","
      },
      {
        "id": 607,
        "logprob": -0.7089844,
        "special": false,
        "text": " which"
      },
      {
        "id": 508,
        "logprob": -1.7724609,
        "special": false,
        "text": " can"
      },
      {
        "id": 367,
        "logprob": 0.0,
        "special": false,
        "text": " be"
      },
      {
        "id": 5545,
        "logprob": 0.0,
        "special": false,
        "text": " considered"
      },
      {
        "id": 408,
        "logprob": -0.3869629,
        "special": false,
        "text": " as"
      }
    ]
  },
  "generated_text": "What is Deep Learning?\nDeep learning, which can be considered as"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.1845703,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2772,
          "logprob": -0.5727539,
          "special": false,
          "text": "De"
        },
        {
          "id": 1022,
          "logprob": -0.00010967255,
          "special": false,
          "text": "ep"
        },
        {
          "id": 6509,
          "logprob": -0.1239624,
          "special": false,
          "text": " learning"
        },
        {
          "id": 338,
          "logprob": -0.04510498,
          "special": false,
          "text": " is"
        },
        {
          "id": 263,
          "logprob": -0.018295288,
          "special": false,
          "text": " a"
        },
        {
          "id": 11306,
          "logprob": -0.45922852,
          "special": false,
          "text": " subset"
        },
        {
          "id": 310,
          "logprob": -0.00020992756,
          "special": false,
          "text": " of"
        },
        {
          "id": 4933,
          "logprob": -0.0046539307,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6509,
          "logprob": -0.00025844574,
          "special": false,
          "text": " learning"
        }
      ]
    },
    "generated_text": "\nDeep learning is a subset of machine learning"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.1826172,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2772,
          "logprob": -0.56689453,
          "special": false,
          "text": "De"
        },
        {
          "id": 1022,
          "logprob": -0.000108003616,
          "special": false,
          "text": "ep"
        },
        {
          "id": 6509,
          "logprob": -0.1239624,
          "special": false,
          "text": " learning"
        },
        {
          "id": 338,
          "logprob": -0.044433594,
          "special": false,
          "text": " is"
        },
        {
          "id": 263,
          "logprob": -0.018295288,
          "special": false,
          "text": " a"
        },
        {
          "id": 11306,
          "logprob": -0.45922852,
          "special": false,
          "text": " subset"
        },
        {
          "id": 310,
          "logprob": -0.0002104044,
          "special": false,
          "text": " of"
        },
        {
          "id": 4933,
          "logprob": -0.004711151,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6509,
          "logprob": -0.00025892258,
          "special": false,
          "text": " learning"
        }
      ]
    },
    "generated_text": "\nDeep learning is a subset of machine learning"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.1826172,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2772,
          "logprob": -0.56689453,
          "special": false,
          "text": "De"
        },
        {
          "id": 1022,
          "logprob": -0.000108003616,
          "special": false,
          "text": "ep"
        },
        {
          "id": 6509,
          "logprob": -0.1239624,
          "special": false,
          "text": " learning"
        },
        {
          "id": 338,
          "logprob": -0.044433594,
          "special": false,
          "text": " is"
        },
        {
          "id": 263,
          "logprob": -0.018295288,
          "special": false,
          "text": " a"
        },
        {
          "id": 11306,
          "logprob": -0.45922852,
          "special": false,
          "text": " subset"
        },
        {
          "id": 310,
          "logprob": -0.0002104044,
          "special": false,
          "text": " of"
        },
        {
          "id": 4933,
          "logprob": -0.004711151,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6509,
          "logprob": -0.00025892258,
          "special": false,
          "text": " learning"
        }
      ]
    },
    "generated_text": "\nDeep learning is a subset of machine learning"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -1.1826172,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2772,
          "logprob": -0.56689453,
          "special": false,
          "text": "De"
        },
        {
          "id": 1022,
          "logprob": -0.000108003616,
          "special": false,
          "text": "ep"
        },
        {
          "id": 6509,
          "logprob": -0.1239624,
          "special": false,
          "text": " learning"
        },
        {
          "id": 338,
          "logprob": -0.044433594,
          "special": false,
          "text": " is"
        },
        {
          "id": 263,
          "logprob": -0.018295288,
          "special": false,
          "text": " a"
        },
        {
          "id": 11306,
          "logprob": -0.45922852,
          "special": false,
          "text": " subset"
        },
        {
          "id": 310,
          "logprob": -0.0002104044,
          "special": false,
          "text": " of"
        },
        {
          "id": 4933,
          "logprob": -0.004711151,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6509,
          "logprob": -0.00025892258,
          "special": false,
          "text": " learning"
        }
      ]
    },
    "generated_text": "\nDeep learning is a subset of machine learning"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_simple.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -1.1845703,
        "special": false,
        "text": "\n"
      },
      {
        "id": 2772,
        "logprob": -0.5727539,
        "special": false,
        "text": "De"
      },
      {
        "id": 1022,
        "logprob": -0.000108122826,
        "special": false,
        "text": "ep"
      },
      {
        "id": 6509,
        "logprob": -0.1239624,
        "special": false,
        "text": " learning"
      },
      {
        "id": 338,
        "logprob": -0.044433594,
        "special": false,
        "text": " is"
      },
      {
        "id": 263,
        "logprob": -0.01852417,
        "special": false,
        "text": " a"
      },
      {
        "id": 11306,
        "logprob": -0.45922852,
        "special": false,
        "text": " subset"
      },
      {
        "id": 310,
        "logprob": -0.0002104044,
        "special": false,
        "text": " of"
      },
      {
        "id": 4933,
        "logprob": -0.004787445,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6509,
        "logprob": -0.00026226044,
        "special": false,
        "text": " learning"
      }
    ]
  },
  "generated_text": "\nDeep learning is a subset of machine learning"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 28747,
        "logprob": -0.54785156,
        "special": false,
        "text": ":"
      },
      {
        "id": 3169,
        "logprob": -1.4091797,
        "special": false,
        "text": " Let"
      },
      {
        "id": 307,
        "logprob": -3.0273438,
        "special": false,
        "text": " n"
      },
      {
        "id": 327,
        "logprob": -0.94433594,
        "special": false,
        "text": " ="
      },
      {
        "id": 28705,
        "logprob": -0.81347656,
        "special": false,
        "text": " "
      },
      {
        "id": 28740,
        "logprob": -1.2958984,
        "special": false,
        "text": "1"
      },
      {
        "id": 28734,
        "logprob": -2.0644531,
        "special": false,
        "text": "0"
      },
      {
        "id": 387,
        "logprob": -1.9580078,
        "special": false,
        "text": " -"
      },
      {
        "id": 28705,
        "logprob": -0.5073242,
        "special": false,
        "text": " "
      },
      {
        "id": 28740,
        "logprob": -1.1816406,
        "special": false,
        "text": "1"
      }
    ],
    "top_tokens": null
  },
  "generated_text": ": Let n = 10 - 1"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 28747,
        "logprob": 0.0,
        "special": false,
        "text": ":"
      },
      {
        "id": 3169,
        "logprob": -0.1307373,
        "special": false,
        "text": " Let"
      },
      {
        "id": 332,
        "logprob": -2.3359375,
        "special": false,
        "text": " u"
      },
      {
        "id": 347,
        "logprob": 0.0,
        "special": false,
        "text": " be"
      },
      {
        "id": 325,
        "logprob": -1.0234375,
        "special": false,
        "text": " ("
      },
      {
        "id": 28734,
        "logprob": -2.0292969,
        "special": false,
        "text": "0"
      },
      {
        "id": 648,
        "logprob": -1.0439453,
        "special": false,
        "text": " +"
      },
      {
        "id": 28705,
        "logprob": -0.24499512,
        "special": false,
        "text": " "
      },
      {
        "id": 28770,
        "logprob": -0.5073242,
        "special": false,
        "text": "3"
      },
      {
        "id": 387,
        "logprob": -1.5507812,
        "special": false,
        "text": " -"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request: Let u be (0 + 3 -"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 28747,
          "logprob": -0.55078125,
          "special": false,
          "text": ":"
        },
        {
          "id": 3169,
          "logprob": -1.4140625,
          "special": false,
          "text": " Let"
        },
        {
          "id": 307,
          "logprob": -3.0273438,
          "special": false,
          "text": " n"
        },
        {
          "id": 327,
          "logprob": -0.94140625,
          "special": false,
          "text": " ="
        },
        {
          "id": 28705,
          "logprob": -0.8173828,
          "special": false,
          "text": " "
        },
        {
          "id": 28740,
          "logprob": -1.2978516,
          "special": false,
          "text": "1"
        },
        {
          "id": 28734,
          "logprob": -2.0664062,
          "special": false,
          "text": "0"
        },
        {
          "id": 387,
          "logprob": -1.9560547,
          "special": false,
          "text": " -"
        },
        {
          "id": 28705,
          "logprob": -0.5078125,
          "special": false,
          "text": " "
        },
        {
          "id": 28740,
          "logprob": -1.1787109,
          "special": false,
          "text": "1"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": Let n = 10 - 1"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 28747,
          "logprob": -0.54785156,
          "special": false,
          "text": ":"
        },
        {
          "id": 3169,
          "logprob": -1.4111328,
          "special": false,
          "text": " Let"
        },
        {
          "id": 307,
          "logprob": -3.0292969,
          "special": false,
          "text": " n"
        },
        {
          "id": 327,
          "logprob": -0.94433594,
          "special": false,
          "text": " ="
        },
        {
          "id": 28705,
          "logprob": -0.8178711,
          "special": false,
          "text": " "
        },
        {
          "id": 28740,
          "logprob": -1.2939453,
          "special": false,
          "text": "1"
        },
        {
          "id": 28734,
          "logprob": -2.0644531,
          "special": false,
          "text": "0"
        },
        {
          "id": 387,
          "logprob": -1.9550781,
          "special": false,
          "text": " -"
        },
        {
          "id": 28705,
          "logprob": -0.5078125,
          "special": false,
          "text": " "
        },
        {
          "id": 28740,
          "logprob": -1.1796875,
          "special": false,
          "text": "1"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": Let n = 10 - 1"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 28747,
          "logprob": -0.55078125,
          "special": false,
          "text": ":"
        },
        {
          "id": 3169,
          "logprob": -1.4140625,
          "special": false,
          "text": " Let"
        },
        {
          "id": 307,
          "logprob": -3.0273438,
          "special": false,
          "text": " n"
        },
        {
          "id": 327,
          "logprob": -0.94140625,
          "special": false,
          "text": " ="
        },
        {
          "id": 28705,
          "logprob": -0.8173828,
          "special": false,
          "text": " "
        },
        {
          "id": 28740,
          "logprob": -1.2978516,
          "special": false,
          "text": "1"
        },
        {
          "id": 28734,
          "logprob": -2.0664062,
          "special": false,
          "text": "0"
        },
        {
          "id": 387,
          "logprob": -1.9560547,
          "special": false,
          "text": " -"
        },
        {
          "id": 28705,
          "logprob": -0.5078125,
          "special": false,
          "text": " "
        },
        {
          "id": 28740,
          "logprob": -1.1787109,
          "special": false,
          "text": "1"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": Let n = 10 - 1"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 28747,
          "logprob": -0.55078125,
          "special": false,
          "text": ":"
        },
        {
          "id": 3169,
          "logprob": -1.4140625,
          "special": false,
          "text": " Let"
        },
        {
          "id": 307,
          "logprob": -3.0273438,
          "special": false,
          "text": " n"
        },
        {
          "id": 327,
          "logprob": -0.94140625,
          "special": false,
          "text": " ="
        },
        {
          "id": 28705,
          "logprob": -0.8173828,
          "special": false,
          "text": " "
        },
        {
          "id": 28740,
          "logprob": -1.2978516,
          "special": false,
          "text": "1"
        },
        {
          "id": 28734,
          "logprob": -2.0664062,
          "special": false,
          "text": "0"
        },
        {
          "id": 387,
          "logprob": -1.9560547,
          "special": false,
          "text": " -"
        },
        {
          "id": 28705,
          "logprob": -0.5078125,
          "special": false,
          "text": " "
        },
        {
          "id": 28740,
          "logprob": -1.1787109,
          "special": false,
          "text": "1"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": Let n = 10 - 1"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 20910,
        "logprob": -0.96484375,
        "special": false,
        "text": "Grad"
      },
      {
        "id": 722,
        "logprob": -0.003168106,
        "special": false,
        "text": "ient"
      },
      {
        "id": 24871,
        "logprob": -0.16540527,
        "special": false,
        "text": " descent"
      },
      {
        "id": 349,
        "logprob": -0.08886719,
        "special": false,
        "text": " is"
      },
      {
        "id": 396,
        "logprob": -0.75878906,
        "special": false,
        "text": " an"
      },
      {
        "id": 18586,
        "logprob": -0.5703125,
        "special": false,
        "text": " optimization"
      },
      {
        "id": 9464,
        "logprob": -0.11242676,
        "special": false,
        "text": " algorithm"
      },
      {
        "id": 1307,
        "logprob": -0.7939453,
        "special": false,
        "text": " used"
      },
      {
        "id": 298,
        "logprob": -0.17102051,
        "special": false,
        "text": " to"
      },
      {
        "id": 26518,
        "logprob": -0.34326172,
        "special": false,
        "text": " minimize"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Gradient descent is an optimization algorithm used to minimize"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 1313,
        "logprob": -2.3613281,
        "special": false,
        "text": "It"
      },
      {
        "id": 3969,
        "logprob": -0.7285156,
        "special": false,
        "text": " seems"
      },
      {
        "id": 298,
        "logprob": -1.3466797,
        "special": false,
        "text": " to"
      },
      {
        "id": 528,
        "logprob": 0.0,
        "special": false,
        "text": " me"
      },
      {
        "id": 28725,
        "logprob": -1.6757812,
        "special": false,
        "text": ","
      },
      {
        "id": 369,
        "logprob": 0.0,
        "special": false,
        "text": " that"
      },
      {
        "id": 513,
        "logprob": -1.1269531,
        "special": false,
        "text": " if"
      },
      {
        "id": 368,
        "logprob": 0.0,
        "special": false,
        "text": " you"
      },
      {
        "id": 28742,
        "logprob": -2.4921875,
        "special": false,
        "text": "'"
      },
      {
        "id": 267,
        "logprob": 0.0,
        "special": false,
        "text": "re"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is gradient descent?\n\nIt seems to me, that if you're"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 20910,
          "logprob": -0.96484375,
          "special": false,
          "text": "Grad"
        },
        {
          "id": 722,
          "logprob": -0.003168106,
          "special": false,
          "text": "ient"
        },
        {
          "id": 24871,
          "logprob": -0.16369629,
          "special": false,
          "text": " descent"
        },
        {
          "id": 349,
          "logprob": -0.0881958,
          "special": false,
          "text": " is"
        },
        {
          "id": 396,
          "logprob": -0.76708984,
          "special": false,
          "text": " an"
        },
        {
          "id": 18586,
          "logprob": -0.57373047,
          "special": false,
          "text": " optimization"
        },
        {
          "id": 9464,
          "logprob": -0.11291504,
          "special": false,
          "text": " algorithm"
        },
        {
          "id": 1307,
          "logprob": -0.79589844,
          "special": false,
          "text": " used"
        },
        {
          "id": 298,
          "logprob": -0.1694336,
          "special": false,
          "text": " to"
        },
        {
          "id": 26518,
          "logprob": -0.34350586,
          "special": false,
          "text": " minimize"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Gradient descent is an optimization algorithm used to minimize"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 20910,
          "logprob": -0.9628906,
          "special": false,
          "text": "Grad"
        },
        {
          "id": 722,
          "logprob": -0.0032176971,
          "special": false,
          "text": "ient"
        },
        {
          "id": 24871,
          "logprob": -0.16540527,
          "special": false,
          "text": " descent"
        },
        {
          "id": 349,
          "logprob": -0.08898926,
          "special": false,
          "text": " is"
        },
        {
          "id": 396,
          "logprob": -0.765625,
          "special": false,
          "text": " an"
        },
        {
          "id": 18586,
          "logprob": -0.5708008,
          "special": false,
          "text": " optimization"
        },
        {
          "id": 9464,
          "logprob": -0.11401367,
          "special": false,
          "text": " algorithm"
        },
        {
          "id": 1307,
          "logprob": -0.7963867,
          "special": false,
          "text": " used"
        },
        {
          "id": 298,
          "logprob": -0.17028809,
          "special": false,
          "text": " to"
        },
        {
          "id": 26518,
          "logprob": -0.34326172,
          "special": false,
          "text": " minimize"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Gradient descent is an optimization algorithm used to minimize"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 20910,
          "logprob": -0.9580078,
          "special": false,
          "text": "Grad"
        },
        {
          "id": 722,
          "logprob": -0.0032176971,
          "special": false,
          "text": "ient"
        },
        {
          "id": 24871,
          "logprob": -0.16552734,
          "special": false,
          "text": " descent"
        },
        {
          "id": 349,
          "logprob": -0.08874512,
          "special": false,
          "text": " is"
        },
        {
          "id": 396,
          "logprob": -0.75878906,
          "special": false,
          "text": " an"
        },
        {
          "id": 18586,
          "logprob": -0.5703125,
          "special": false,
          "text": " optimization"
        },
        {
          "id": 9464,
          "logprob": -0.11236572,
          "special": false,
          "text": " algorithm"
        },
        {
          "id": 1307,
          "logprob": -0.79541016,
          "special": false,
          "text": " used"
        },
        {
          "id": 298,
          "logprob": -0.17102051,
          "special": false,
          "text": " to"
        },
        {
          "id": 26518,
          "logprob": -0.34326172,
          "special": false,
          "text": " minimize"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Gradient descent is an optimization algorithm used to minimize"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 20910,
          "logprob": -0.9609375,
          "special": false,
          "text": "Grad"
        },
        {
          "id": 722,
          "logprob": -0.003168106,
          "special": false,
          "text": "ient"
        },
        {
          "id": 24871,
          "logprob": -0.16601562,
          "special": false,
          "text": " descent"
        },
        {
          "id": 349,
          "logprob": -0.088134766,
          "special": false,
          "text": " is"
        },
        {
          "id": 396,
          "logprob": -0.7597656,
          "special": false,
          "text": " an"
        },
        {
          "id": 18586,
          "logprob": -0.5708008,
          "special": false,
          "text": " optimization"
        },
        {
          "id": 9464,
          "logprob": -0.11291504,
          "special": false,
          "text": " algorithm"
        },
        {
          "id": 1307,
          "logprob": -0.7944336,
          "special": false,
          "text": " used"
        },
        {
          "id": 298,
          "logprob": -0.17102051,
          "special": false,
          "text": " to"
        },
        {
          "id": 26518,
          "logprob": -0.34399414,
          "special": false,
          "text": " minimize"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Gradient descent is an optimization algorithm used to minimize"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mixtral_awq/test_flash_mixtral_awq.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -0.50878906,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.8876953,
        "special": false,
        "text": "\n"
      },
      {
        "id": 23229,
        "logprob": -0.15124512,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 5168,
        "logprob": -0.030288696,
        "special": false,
        "text": " learning"
      },
      {
        "id": 349,
        "logprob": -0.16687012,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.17858887,
        "special": false,
        "text": " a"
      },
      {
        "id": 19804,
        "logprob": -0.8046875,
        "special": false,
        "text": " subset"
      },
      {
        "id": 302,
        "logprob": -0.007205963,
        "special": false,
        "text": " of"
      },
      {
        "id": 5599,
        "logprob": -0.090026855,
        "special": false,
        "text": " machine"
      },
      {
        "id": 5168,
        "logprob": -0.0030670166,
        "special": false,
        "text": " learning"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a subset of machine learning"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mixtral_awq/test_flash_mixtral_awq_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 13,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 23229,
        "logprob": -0.5229492,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 17504,
        "logprob": 0.0,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 349,
        "logprob": -0.5151367,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 19804,
        "logprob": 0.0,
        "special": false,
        "text": " subset"
      },
      {
        "id": 302,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 13253,
        "logprob": -1.3359375,
        "special": false,
        "text": " Machine"
      },
      {
        "id": 17504,
        "logprob": 0.0,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 28725,
        "logprob": 0.0,
        "special": false,
        "text": ","
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep Learning is a subset of Machine Learning,"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mixtral_awq/test_flash_mixtral_awq_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.50878906,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.8876953,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23229,
          "logprob": -0.15136719,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 5168,
          "logprob": -0.030273438,
          "special": false,
          "text": " learning"
        },
        {
          "id": 349,
          "logprob": -0.1665039,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.1776123,
          "special": false,
          "text": " a"
        },
        {
          "id": 19804,
          "logprob": -0.8076172,
          "special": false,
          "text": " subset"
        },
        {
          "id": 302,
          "logprob": -0.007183075,
          "special": false,
          "text": " of"
        },
        {
          "id": 5599,
          "logprob": -0.090148926,
          "special": false,
          "text": " machine"
        },
        {
          "id": 5168,
          "logprob": -0.0030670166,
          "special": false,
          "text": " learning"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.51220703,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.87402344,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23229,
          "logprob": -0.15039062,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 5168,
          "logprob": -0.030288696,
          "special": false,
          "text": " learning"
        },
        {
          "id": 349,
          "logprob": -0.1652832,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.17858887,
          "special": false,
          "text": " a"
        },
        {
          "id": 19804,
          "logprob": -0.81103516,
          "special": false,
          "text": " subset"
        },
        {
          "id": 302,
          "logprob": -0.007183075,
          "special": false,
          "text": " of"
        },
        {
          "id": 5599,
          "logprob": -0.08880615,
          "special": false,
          "text": " machine"
        },
        {
          "id": 5168,
          "logprob": -0.0030612946,
          "special": false,
          "text": " learning"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.51220703,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.87402344,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23229,
          "logprob": -0.15039062,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 5168,
          "logprob": -0.030288696,
          "special": false,
          "text": " learning"
        },
        {
          "id": 349,
          "logprob": -0.1652832,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.17858887,
          "special": false,
          "text": " a"
        },
        {
          "id": 19804,
          "logprob": -0.81103516,
          "special": false,
          "text": " subset"
        },
        {
          "id": 302,
          "logprob": -0.007183075,
          "special": false,
          "text": " of"
        },
        {
          "id": 5599,
          "logprob": -0.08880615,
          "special": false,
          "text": " machine"
        },
        {
          "id": 5168,
          "logprob": -0.0030612946,
          "special": false,
          "text": " learning"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.51220703,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.87402344,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23229,
          "logprob": -0.15039062,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 5168,
          "logprob": -0.030288696,
          "special": false,
          "text": " learning"
        },
        {
          "id": 349,
          "logprob": -0.1652832,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.17858887,
          "special": false,
          "text": " a"
        },
        {
          "id": 19804,
          "logprob": -0.81103516,
          "special": false,
          "text": " subset"
        },
        {
          "id": 302,
          "logprob": -0.007183075,
          "special": false,
          "text": " of"
        },
        {
          "id": 5599,
          "logprob": -0.08880615,
          "special": false,
          "text": " machine"
        },
        {
          "id": 5168,
          "logprob": -0.0030612946,
          "special": false,
          "text": " learning"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -0.6953125,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.4777832,
        "special": false,
        "text": "\n"
      },
      {
        "id": 23229,
        "logprob": -0.13256836,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 5168,
        "logprob": -0.023849487,
        "special": false,
        "text": " learning"
      },
      {
        "id": 349,
        "logprob": -0.13977051,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.14489746,
        "special": false,
        "text": " a"
      },
      {
        "id": 19804,
        "logprob": -0.63183594,
        "special": false,
        "text": " subset"
      },
      {
        "id": 302,
        "logprob": -0.010314941,
        "special": false,
        "text": " of"
      },
      {
        "id": 5599,
        "logprob": -0.0635376,
        "special": false,
        "text": " machine"
      },
      {
        "id": 5168,
        "logprob": -0.0028572083,
        "special": false,
        "text": " learning"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a subset of machine learning"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 13,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 23229,
        "logprob": -0.18237305,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 17504,
        "logprob": 0.0,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 349,
        "logprob": 0.0,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 19804,
        "logprob": 0.0,
        "special": false,
        "text": " subset"
      },
      {
        "id": 302,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 13253,
        "logprob": -0.6040039,
        "special": false,
        "text": " Machine"
      },
      {
        "id": 17504,
        "logprob": 0.0,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 28725,
        "logprob": -0.11621094,
        "special": false,
        "text": ","
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep Learning is a subset of Machine Learning,"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.6953125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.4777832,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23229,
          "logprob": -0.13232422,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 5168,
          "logprob": -0.023834229,
          "special": false,
          "text": " learning"
        },
        {
          "id": 349,
          "logprob": -0.13977051,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.14416504,
          "special": false,
          "text": " a"
        },
        {
          "id": 19804,
          "logprob": -0.63183594,
          "special": false,
          "text": " subset"
        },
        {
          "id": 302,
          "logprob": -0.010223389,
          "special": false,
          "text": " of"
        },
        {
          "id": 5599,
          "logprob": -0.064208984,
          "special": false,
          "text": " machine"
        },
        {
          "id": 5168,
          "logprob": -0.0028266907,
          "special": false,
          "text": " learning"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.6953125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.48339844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23229,
          "logprob": -0.13256836,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 5168,
          "logprob": -0.02420044,
          "special": false,
          "text": " learning"
        },
        {
          "id": 349,
          "logprob": -0.13977051,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.14501953,
          "special": false,
          "text": " a"
        },
        {
          "id": 19804,
          "logprob": -0.63134766,
          "special": false,
          "text": " subset"
        },
        {
          "id": 302,
          "logprob": -0.010223389,
          "special": false,
          "text": " of"
        },
        {
          "id": 5599,
          "logprob": -0.06427002,
          "special": false,
          "text": " machine"
        },
        {
          "id": 5168,
          "logprob": -0.002817154,
          "special": false,
          "text": " learning"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.6953125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.48339844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23229,
          "logprob": -0.13256836,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 5168,
          "logprob": -0.02420044,
          "special": false,
          "text": " learning"
        },
        {
          "id": 349,
          "logprob": -0.13977051,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.14501953,
          "special": false,
          "text": " a"
        },
        {
          "id": 19804,
          "logprob": -0.63134766,
          "special": false,
          "text": " subset"
        },
        {
          "id": 302,
          "logprob": -0.010223389,
          "special": false,
          "text": " of"
        },
        {
          "id": 5599,
          "logprob": -0.06427002,
          "special": false,
          "text": " machine"
        },
        {
          "id": 5168,
          "logprob": -0.002817154,
          "special": false,
          "text": " learning"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.6953125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.48339844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 23229,
          "logprob": -0.13256836,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 5168,
          "logprob": -0.02420044,
          "special": false,
          "text": " learning"
        },
        {
          "id": 349,
          "logprob": -0.13977051,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.14501953,
          "special": false,
          "text": " a"
        },
        {
          "id": 19804,
          "logprob": -0.63134766,
          "special": false,
          "text": " subset"
        },
        {
          "id": 302,
          "logprob": -0.010223389,
          "special": false,
          "text": " of"
        },
        {
          "id": 5599,
          "logprob": -0.06427002,
          "special": false,
          "text": " machine"
        },
        {
          "id": 5168,
          "logprob": -0.002817154,
          "special": false,
          "text": " learning"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 42,
        "logprob": -0.88378906,
        "special": false,
        "text": "I"
      },
      {
        "id": 1353,
        "logprob": -0.94921875,
        "special": false,
        "text": "'m"
      },
      {
        "id": 417,
        "logprob": -2.2402344,
        "special": false,
        "text": " not"
      },
      {
        "id": 2119,
        "logprob": -0.3725586,
        "special": false,
        "text": " sure"
      },
      {
        "id": 13,
        "logprob": -1.078125,
        "special": false,
        "text": ","
      },
      {
        "id": 534,
        "logprob": -0.67822266,
        "special": false,
        "text": " which"
      },
      {
        "id": 310,
        "logprob": -1.3837891,
        "special": false,
        "text": " is"
      },
      {
        "id": 253,
        "logprob": -1.7050781,
        "special": false,
        "text": " the"
      },
      {
        "id": 1682,
        "logprob": -0.052001953,
        "special": false,
        "text": " best"
      },
      {
        "id": 1039,
        "logprob": -2.0390625,
        "special": false,
        "text": " way"
      }
    ]
  },
  "generated_text": "I'm not sure, which is the best way"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_neox/test_flash_neox_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 42,
          "logprob": -0.8886719,
          "special": false,
          "text": "I"
        },
        {
          "id": 1353,
          "logprob": -0.98046875,
          "special": false,
          "text": "'m"
        },
        {
          "id": 417,
          "logprob": -2.2265625,
          "special": false,
          "text": " not"
        },
        {
          "id": 2119,
          "logprob": -0.3479004,
          "special": false,
          "text": " sure"
        },
        {
          "id": 13,
          "logprob": -1.0117188,
          "special": false,
          "text": ","
        },
        {
          "id": 534,
          "logprob": -0.67871094,
          "special": false,
          "text": " which"
        },
        {
          "id": 310,
          "logprob": -1.421875,
          "special": false,
          "text": " is"
        },
        {
          "id": 253,
          "logprob": -1.7382812,
          "special": false,
          "text": " the"
        },
        {
          "id": 1682,
          "logprob": -0.051330566,
          "special": false,
          "text": " best"
        },
        {
          "id": 1039,
          "logprob": -2.0390625,
          "special": false,
          "text": " way"
        }
      ]
    },
    "generated_text": "I'm not sure, which is the best way"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 42,
          "logprob": -0.88378906,
          "special": false,
          "text": "I"
        },
        {
          "id": 1353,
          "logprob": -0.9819336,
          "special": false,
          "text": "'m"
        },
        {
          "id": 417,
          "logprob": -2.2421875,
          "special": false,
          "text": " not"
        },
        {
          "id": 2119,
          "logprob": -0.3474121,
          "special": false,
          "text": " sure"
        },
        {
          "id": 13,
          "logprob": -1.078125,
          "special": false,
          "text": ","
        },
        {
          "id": 534,
          "logprob": -0.69140625,
          "special": false,
          "text": " which"
        },
        {
          "id": 310,
          "logprob": -1.4072266,
          "special": false,
          "text": " is"
        },
        {
          "id": 253,
          "logprob": -1.7041016,
          "special": false,
          "text": " the"
        },
        {
          "id": 1682,
          "logprob": -0.053375244,
          "special": false,
          "text": " best"
        },
        {
          "id": 1039,
          "logprob": -2.0351562,
          "special": false,
          "text": " way"
        }
      ]
    },
    "generated_text": "I'm not sure, which is the best way"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 42,
          "logprob": -0.8886719,
          "special": false,
          "text": "I"
        },
        {
          "id": 1353,
          "logprob": -0.98046875,
          "special": false,
          "text": "'m"
        },
        {
          "id": 417,
          "logprob": -2.2265625,
          "special": false,
          "text": " not"
        },
        {
          "id": 2119,
          "logprob": -0.3479004,
          "special": false,
          "text": " sure"
        },
        {
          "id": 13,
          "logprob": -1.0117188,
          "special": false,
          "text": ","
        },
        {
          "id": 534,
          "logprob": -0.67871094,
          "special": false,
          "text": " which"
        },
        {
          "id": 310,
          "logprob": -1.421875,
          "special": false,
          "text": " is"
        },
        {
          "id": 253,
          "logprob": -1.7382812,
          "special": false,
          "text": " the"
        },
        {
          "id": 1682,
          "logprob": -0.051330566,
          "special": false,
          "text": " best"
        },
        {
          "id": 1039,
          "logprob": -2.0390625,
          "special": false,
          "text": " way"
        }
      ]
    },
    "generated_text": "I'm not sure, which is the best way"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 42,
          "logprob": -0.8886719,
          "special": false,
          "text": "I"
        },
        {
          "id": 1353,
          "logprob": -0.98046875,
          "special": false,
          "text": "'m"
        },
        {
          "id": 417,
          "logprob": -2.2265625,
          "special": false,
          "text": " not"
        },
        {
          "id": 2119,
          "logprob": -0.3479004,
          "special": false,
          "text": " sure"
        },
        {
          "id": 13,
          "logprob": -1.0117188,
          "special": false,
          "text": ","
        },
        {
          "id": 534,
          "logprob": -0.67871094,
          "special": false,
          "text": " which"
        },
        {
          "id": 310,
          "logprob": -1.421875,
          "special": false,
          "text": " is"
        },
        {
          "id": 253,
          "logprob": -1.7382812,
          "special": false,
          "text": " the"
        },
        {
          "id": 1682,
          "logprob": -0.051330566,
          "special": false,
          "text": " best"
        },
        {
          "id": 1039,
          "logprob": -2.0390625,
          "special": false,
          "text": " way"
        }
      ]
    },
    "generated_text": "I'm not sure, which is the best way"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_neox_sharded/test_flash_neox.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 510,
        "logprob": -0.63183594,
        "special": false,
        "text": "The"
      },
      {
        "id": 3159,
        "logprob": -0.5390625,
        "special": false,
        "text": " word"
      },
      {
        "id": 346,
        "logprob": -0.045684814,
        "special": false,
        "text": " \""
      },
      {
        "id": 6441,
        "logprob": -0.002090454,
        "special": false,
        "text": "mem"
      },
      {
        "id": 70,
        "logprob": -1.3589859e-05,
        "special": false,
        "text": "e"
      },
      {
        "id": 3,
        "logprob": -0.0009455681,
        "special": false,
        "text": "\""
      },
      {
        "id": 369,
        "logprob": -0.088012695,
        "special": false,
        "text": " was"
      },
      {
        "id": 806,
        "logprob": -0.12585449,
        "special": false,
        "text": " first"
      },
      {
        "id": 908,
        "logprob": -0.017196655,
        "special": false,
        "text": " used"
      },
      {
        "id": 275,
        "logprob": -0.49731445,
        "special": false,
        "text": " in"
      }
    ]
  },
  "generated_text": "The word \"meme\" was first used in"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_neox_sharded/test_flash_neox_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 510,
          "logprob": -0.63183594,
          "special": false,
          "text": "The"
        },
        {
          "id": 3159,
          "logprob": -0.5488281,
          "special": false,
          "text": " word"
        },
        {
          "id": 346,
          "logprob": -0.045684814,
          "special": false,
          "text": " \""
        },
        {
          "id": 6441,
          "logprob": -0.00207901,
          "special": false,
          "text": "mem"
        },
        {
          "id": 70,
          "logprob": -1.335144e-05,
          "special": false,
          "text": "e"
        },
        {
          "id": 3,
          "logprob": -0.00097227097,
          "special": false,
          "text": "\""
        },
        {
          "id": 369,
          "logprob": -0.0892334,
          "special": false,
          "text": " was"
        },
        {
          "id": 806,
          "logprob": -0.12463379,
          "special": false,
          "text": " first"
        },
        {
          "id": 908,
          "logprob": -0.01737976,
          "special": false,
          "text": " used"
        },
        {
          "id": 275,
          "logprob": -0.50341797,
          "special": false,
          "text": " in"
        }
      ]
    },
    "generated_text": "The word \"meme\" was first used in"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 510,
          "logprob": -0.63183594,
          "special": false,
          "text": "The"
        },
        {
          "id": 3159,
          "logprob": -0.5488281,
          "special": false,
          "text": " word"
        },
        {
          "id": 346,
          "logprob": -0.045684814,
          "special": false,
          "text": " \""
        },
        {
          "id": 6441,
          "logprob": -0.00207901,
          "special": false,
          "text": "mem"
        },
        {
          "id": 70,
          "logprob": -1.335144e-05,
          "special": false,
          "text": "e"
        },
        {
          "id": 3,
          "logprob": -0.00097227097,
          "special": false,
          "text": "\""
        },
        {
          "id": 369,
          "logprob": -0.0892334,
          "special": false,
          "text": " was"
        },
        {
          "id": 806,
          "logprob": -0.12463379,
          "special": false,
          "text": " first"
        },
        {
          "id": 908,
          "logprob": -0.01737976,
          "special": false,
          "text": " used"
        },
        {
          "id": 275,
          "logprob": -0.50341797,
          "special": false,
          "text": " in"
        }
      ]
    },
    "generated_text": "The word \"meme\" was first used in"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 510,
          "logprob": -0.63183594,
          "special": false,
          "text": "The"
        },
        {
          "id": 3159,
          "logprob": -0.5488281,
          "special": false,
          "text": " word"
        },
        {
          "id": 346,
          "logprob": -0.045684814,
          "special": false,
          "text": " \""
        },
        {
          "id": 6441,
          "logprob": -0.00207901,
          "special": false,
          "text": "mem"
        },
        {
          "id": 70,
          "logprob": -1.335144e-05,
          "special": false,
          "text": "e"
        },
        {
          "id": 3,
          "logprob": -0.00097227097,
          "special": false,
          "text": "\""
        },
        {
          "id": 369,
          "logprob": -0.0892334,
          "special": false,
          "text": " was"
        },
        {
          "id": 806,
          "logprob": -0.12463379,
          "special": false,
          "text": " first"
        },
        {
          "id": 908,
          "logprob": -0.01737976,
          "special": false,
          "text": " used"
        },
        {
          "id": 275,
          "logprob": -0.50341797,
          "special": false,
          "text": " in"
        }
      ]
    },
    "generated_text": "The word \"meme\" was first used in"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 510,
          "logprob": -0.63183594,
          "special": false,
          "text": "The"
        },
        {
          "id": 3159,
          "logprob": -0.5488281,
          "special": false,
          "text": " word"
        },
        {
          "id": 346,
          "logprob": -0.045684814,
          "special": false,
          "text": " \""
        },
        {
          "id": 6441,
          "logprob": -0.00207901,
          "special": false,
          "text": "mem"
        },
        {
          "id": 70,
          "logprob": -1.335144e-05,
          "special": false,
          "text": "e"
        },
        {
          "id": 3,
          "logprob": -0.00097227097,
          "special": false,
          "text": "\""
        },
        {
          "id": 369,
          "logprob": -0.0892334,
          "special": false,
          "text": " was"
        },
        {
          "id": 806,
          "logprob": -0.12463379,
          "special": false,
          "text": " first"
        },
        {
          "id": 908,
          "logprob": -0.01737976,
          "special": false,
          "text": " used"
        },
        {
          "id": 275,
          "logprob": -0.50341797,
          "special": false,
          "text": " in"
        }
      ]
    },
    "generated_text": "The word \"meme\" was first used in"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 2,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 54901,
        "logprob": -0.84765625,
        "special": false,
        "text": "beach"
      },
      {
        "id": 1,
        "logprob": -0.008666992,
        "special": true,
        "text": "<eos>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "beach"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 8,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 2502,
        "logprob": -1.7890625,
        "special": false,
        "text": "image"
      },
      {
        "id": 2196,
        "logprob": -0.53125,
        "special": false,
        "text": " result"
      },
      {
        "id": 604,
        "logprob": -0.0077209473,
        "special": false,
        "text": " for"
      },
      {
        "id": 12254,
        "logprob": -1.703125,
        "special": false,
        "text": " chicken"
      },
      {
        "id": 611,
        "logprob": -0.21582031,
        "special": false,
        "text": " on"
      },
      {
        "id": 573,
        "logprob": -0.734375,
        "special": false,
        "text": " the"
      },
      {
        "id": 8318,
        "logprob": -0.026000977,
        "special": false,
        "text": " beach"
      },
      {
        "id": 1,
        "logprob": -0.2109375,
        "special": true,
        "text": "<eos>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "image result for chicken on the beach"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_pali_gemma2/test_flash_pali_gemma_image.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 20,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 108,
        "logprob": -0.48046875,
        "special": false,
        "text": "\n"
      },
      {
        "id": 30234,
        "logprob": -2.21875,
        "special": false,
        "text": "Brown"
      },
      {
        "id": 108,
        "logprob": -0.119140625,
        "special": false,
        "text": "\n"
      },
      {
        "id": 3726,
        "logprob": -1.703125,
        "special": false,
        "text": "Car"
      },
      {
        "id": 108,
        "logprob": -0.0390625,
        "special": false,
        "text": "\n"
      },
      {
        "id": 2915,
        "logprob": -1.8203125,
        "special": false,
        "text": "Color"
      },
      {
        "id": 108,
        "logprob": -0.035888672,
        "special": false,
        "text": "\n"
      },
      {
        "id": 19178,
        "logprob": -2.015625,
        "special": false,
        "text": "Cool"
      },
      {
        "id": 108,
        "logprob": -0.08105469,
        "special": false,
        "text": "\n"
      },
      {
        "id": 40544,
        "logprob": -2.09375,
        "special": false,
        "text": "Decor"
      },
      {
        "id": 108,
        "logprob": -0.038330078,
        "special": false,
        "text": "\n"
      },
      {
        "id": 108,
        "logprob": -1.515625,
        "special": false,
        "text": "\n"
      },
      {
        "id": 108,
        "logprob": -1.8671875,
        "special": false,
        "text": "\n"
      },
      {
        "id": 108,
        "logprob": -1.6328125,
        "special": false,
        "text": "\n"
      },
      {
        "id": 108,
        "logprob": -1.265625,
        "special": false,
        "text": "\n"
      },
      {
        "id": 108,
        "logprob": -1.0078125,
        "special": false,
        "text": "\n"
      },
      {
        "id": 108,
        "logprob": -1.03125,
        "special": false,
        "text": "\n"
      },
      {
        "id": 235336,
        "logprob": -1.2109375,
        "special": false,
        "text": "?"
      },
      {
        "id": 108,
        "logprob": -0.29101562,
        "special": false,
        "text": "\n"
      },
      {
        "id": 235336,
        "logprob": -0.08935547,
        "special": false,
        "text": "?"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\nBrown\nCar\nColor\nCool\nDecor\n\n\n\n\n\n\n?\n?"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 25,
        "logprob": -2.3203125,
        "special": false,
        "text": ":"
      },
      {
        "id": 1391,
        "logprob": -0.98779297,
        "special": false,
        "text": " {"
      },
      {
        "id": 25927,
        "logprob": -0.76660156,
        "special": false,
        "text": "request"
      },
      {
        "id": 92,
        "logprob": -0.7246094,
        "special": false,
        "text": "}"
      },
      {
        "id": 4943,
        "logprob": -0.41333008,
        "special": false,
        "text": "\")"
      },
      {
        "id": 198,
        "logprob": -0.11785889,
        "special": false,
        "text": "\n"
      },
      {
        "id": 50280,
        "logprob": -0.97265625,
        "special": false,
        "text": "        "
      },
      {
        "id": 26209,
        "logprob": -1.4414062,
        "special": false,
        "text": "response"
      },
      {
        "id": 796,
        "logprob": -0.0569458,
        "special": false,
        "text": " ="
      },
      {
        "id": 2116,
        "logprob": -1.1533203,
        "special": false,
        "text": " self"
      }
    ],
    "top_tokens": null
  },
  "generated_text": ": {request}\")\n        response = self"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "stop_sequence",
    "generated_tokens": 6,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 284,
        "logprob": -0.28955078,
        "special": false,
        "text": " to"
      },
      {
        "id": 3758,
        "logprob": -0.7739258,
        "special": false,
        "text": " send"
      },
      {
        "id": 1366,
        "logprob": -0.85253906,
        "special": false,
        "text": " data"
      },
      {
        "id": 625,
        "logprob": -0.8984375,
        "special": false,
        "text": " over"
      },
      {
        "id": 257,
        "logprob": -1.0830078,
        "special": false,
        "text": " a"
      },
      {
        "id": 3127,
        "logprob": -1.9404297,
        "special": false,
        "text": " network"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request to send data over a network"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25,
          "logprob": -2.3203125,
          "special": false,
          "text": ":"
        },
        {
          "id": 1391,
          "logprob": -0.98779297,
          "special": false,
          "text": " {"
        },
        {
          "id": 25927,
          "logprob": -0.7729492,
          "special": false,
          "text": "request"
        },
        {
          "id": 92,
          "logprob": -0.7241211,
          "special": false,
          "text": "}"
        },
        {
          "id": 4943,
          "logprob": -0.4091797,
          "special": false,
          "text": "\")"
        },
        {
          "id": 198,
          "logprob": -0.119018555,
          "special": false,
          "text": "\n"
        },
        {
          "id": 50280,
          "logprob": -0.9707031,
          "special": false,
          "text": "        "
        },
        {
          "id": 26209,
          "logprob": -1.4414062,
          "special": false,
          "text": "response"
        },
        {
          "id": 796,
          "logprob": -0.056854248,
          "special": false,
          "text": " ="
        },
        {
          "id": 2116,
          "logprob": -1.1533203,
          "special": false,
          "text": " self"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": {request}\")\n        response = self"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25,
          "logprob": -2.3203125,
          "special": false,
          "text": ":"
        },
        {
          "id": 1391,
          "logprob": -0.98779297,
          "special": false,
          "text": " {"
        },
        {
          "id": 25927,
          "logprob": -0.7729492,
          "special": false,
          "text": "request"
        },
        {
          "id": 92,
          "logprob": -0.7241211,
          "special": false,
          "text": "}"
        },
        {
          "id": 4943,
          "logprob": -0.4091797,
          "special": false,
          "text": "\")"
        },
        {
          "id": 198,
          "logprob": -0.119018555,
          "special": false,
          "text": "\n"
        },
        {
          "id": 50280,
          "logprob": -0.9707031,
          "special": false,
          "text": "        "
        },
        {
          "id": 26209,
          "logprob": -1.4414062,
          "special": false,
          "text": "response"
        },
        {
          "id": 796,
          "logprob": -0.056854248,
          "special": false,
          "text": " ="
        },
        {
          "id": 2116,
          "logprob": -1.1533203,
          "special": false,
          "text": " self"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": {request}\")\n        response = self"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25,
          "logprob": -2.3203125,
          "special": false,
          "text": ":"
        },
        {
          "id": 1391,
          "logprob": -0.98779297,
          "special": false,
          "text": " {"
        },
        {
          "id": 25927,
          "logprob": -0.7729492,
          "special": false,
          "text": "request"
        },
        {
          "id": 92,
          "logprob": -0.7241211,
          "special": false,
          "text": "}"
        },
        {
          "id": 4943,
          "logprob": -0.4091797,
          "special": false,
          "text": "\")"
        },
        {
          "id": 198,
          "logprob": -0.119018555,
          "special": false,
          "text": "\n"
        },
        {
          "id": 50280,
          "logprob": -0.9707031,
          "special": false,
          "text": "        "
        },
        {
          "id": 26209,
          "logprob": -1.4414062,
          "special": false,
          "text": "response"
        },
        {
          "id": 796,
          "logprob": -0.056854248,
          "special": false,
          "text": " ="
        },
        {
          "id": 2116,
          "logprob": -1.1533203,
          "special": false,
          "text": " self"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": {request}\")\n        response = self"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25,
          "logprob": -2.3203125,
          "special": false,
          "text": ":"
        },
        {
          "id": 1391,
          "logprob": -0.98779297,
          "special": false,
          "text": " {"
        },
        {
          "id": 25927,
          "logprob": -0.7729492,
          "special": false,
          "text": "request"
        },
        {
          "id": 92,
          "logprob": -0.7241211,
          "special": false,
          "text": "}"
        },
        {
          "id": 4943,
          "logprob": -0.4091797,
          "special": false,
          "text": "\")"
        },
        {
          "id": 198,
          "logprob": -0.119018555,
          "special": false,
          "text": "\n"
        },
        {
          "id": 50280,
          "logprob": -0.9707031,
          "special": false,
          "text": "        "
        },
        {
          "id": 26209,
          "logprob": -1.4414062,
          "special": false,
          "text": "response"
        },
        {
          "id": 796,
          "logprob": -0.056854248,
          "special": false,
          "text": " ="
        },
        {
          "id": 2116,
          "logprob": -1.1533203,
          "special": false,
          "text": " self"
        }
      ],
      "top_tokens": null
    },
    "generated_text": ": {request}\")\n        response = self"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 25584,
        "logprob": -0.008979797,
        "special": false,
        "text": "Grad"
      },
      {
        "id": 993,
        "logprob": -8.34465e-07,
        "special": false,
        "text": "ient"
      },
      {
        "id": 26815,
        "logprob": -0.0009407997,
        "special": false,
        "text": " descent"
      },
      {
        "id": 338,
        "logprob": -0.0003838539,
        "special": false,
        "text": " is"
      },
      {
        "id": 385,
        "logprob": -0.24499512,
        "special": false,
        "text": " an"
      },
      {
        "id": 13883,
        "logprob": -0.010406494,
        "special": false,
        "text": " optimization"
      },
      {
        "id": 5687,
        "logprob": -0.00024354458,
        "special": false,
        "text": " algorithm"
      },
      {
        "id": 15574,
        "logprob": -0.6582031,
        "special": false,
        "text": " commonly"
      },
      {
        "id": 1304,
        "logprob": -0.00092840195,
        "special": false,
        "text": " used"
      },
      {
        "id": 297,
        "logprob": -0.19470215,
        "special": false,
        "text": " in"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Gradient descent is an optimization algorithm commonly used in"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 25584,
        "logprob": 0.0,
        "special": false,
        "text": "Grad"
      },
      {
        "id": 993,
        "logprob": 0.0,
        "special": false,
        "text": "ient"
      },
      {
        "id": 2726,
        "logprob": 0.0,
        "special": false,
        "text": " Des"
      },
      {
        "id": 1760,
        "logprob": 0.0,
        "special": false,
        "text": "cent"
      },
      {
        "id": 313,
        "logprob": -0.12322998,
        "special": false,
        "text": " ("
      },
      {
        "id": 29954,
        "logprob": 0.0,
        "special": false,
        "text": "G"
      },
      {
        "id": 29928,
        "logprob": 0.0,
        "special": false,
        "text": "D"
      },
      {
        "id": 29897,
        "logprob": 0.0,
        "special": false,
        "text": ")"
      },
      {
        "id": 338,
        "logprob": -0.6040039,
        "special": false,
        "text": " is"
      },
      {
        "id": 385,
        "logprob": -0.1796875,
        "special": false,
        "text": " an"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is gradient descent?\nGradient Descent (GD) is an"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25584,
          "logprob": -0.008979797,
          "special": false,
          "text": "Grad"
        },
        {
          "id": 993,
          "logprob": -8.34465e-07,
          "special": false,
          "text": "ient"
        },
        {
          "id": 26815,
          "logprob": -0.00097084045,
          "special": false,
          "text": " descent"
        },
        {
          "id": 338,
          "logprob": -0.0003838539,
          "special": false,
          "text": " is"
        },
        {
          "id": 385,
          "logprob": -0.23840332,
          "special": false,
          "text": " an"
        },
        {
          "id": 13883,
          "logprob": -0.010406494,
          "special": false,
          "text": " optimization"
        },
        {
          "id": 5687,
          "logprob": -0.0002501011,
          "special": false,
          "text": " algorithm"
        },
        {
          "id": 15574,
          "logprob": -0.6582031,
          "special": false,
          "text": " commonly"
        },
        {
          "id": 1304,
          "logprob": -0.00092840195,
          "special": false,
          "text": " used"
        },
        {
          "id": 297,
          "logprob": -0.18933105,
          "special": false,
          "text": " in"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Gradient descent is an optimization algorithm commonly used in"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25584,
          "logprob": -0.009017944,
          "special": false,
          "text": "Grad"
        },
        {
          "id": 993,
          "logprob": -9.536743e-07,
          "special": false,
          "text": "ient"
        },
        {
          "id": 26815,
          "logprob": -0.00097084045,
          "special": false,
          "text": " descent"
        },
        {
          "id": 338,
          "logprob": -0.0003838539,
          "special": false,
          "text": " is"
        },
        {
          "id": 385,
          "logprob": -0.24499512,
          "special": false,
          "text": " an"
        },
        {
          "id": 13883,
          "logprob": -0.010406494,
          "special": false,
          "text": " optimization"
        },
        {
          "id": 5687,
          "logprob": -0.0002501011,
          "special": false,
          "text": " algorithm"
        },
        {
          "id": 15574,
          "logprob": -0.6435547,
          "special": false,
          "text": " commonly"
        },
        {
          "id": 1304,
          "logprob": -0.0009279251,
          "special": false,
          "text": " used"
        },
        {
          "id": 297,
          "logprob": -0.18933105,
          "special": false,
          "text": " in"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Gradient descent is an optimization algorithm commonly used in"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25584,
          "logprob": -0.008956909,
          "special": false,
          "text": "Grad"
        },
        {
          "id": 993,
          "logprob": -8.34465e-07,
          "special": false,
          "text": "ient"
        },
        {
          "id": 26815,
          "logprob": -0.0009407997,
          "special": false,
          "text": " descent"
        },
        {
          "id": 338,
          "logprob": -0.0003721714,
          "special": false,
          "text": " is"
        },
        {
          "id": 385,
          "logprob": -0.24499512,
          "special": false,
          "text": " an"
        },
        {
          "id": 13883,
          "logprob": -0.010406494,
          "special": false,
          "text": " optimization"
        },
        {
          "id": 5687,
          "logprob": -0.0002501011,
          "special": false,
          "text": " algorithm"
        },
        {
          "id": 15574,
          "logprob": -0.6435547,
          "special": false,
          "text": " commonly"
        },
        {
          "id": 1304,
          "logprob": -0.00092601776,
          "special": false,
          "text": " used"
        },
        {
          "id": 297,
          "logprob": -0.19177246,
          "special": false,
          "text": " in"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Gradient descent is an optimization algorithm commonly used in"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 25584,
          "logprob": -0.008979797,
          "special": false,
          "text": "Grad"
        },
        {
          "id": 993,
          "logprob": -9.536743e-07,
          "special": false,
          "text": "ient"
        },
        {
          "id": 26815,
          "logprob": -0.0009407997,
          "special": false,
          "text": " descent"
        },
        {
          "id": 338,
          "logprob": -0.00038409233,
          "special": false,
          "text": " is"
        },
        {
          "id": 385,
          "logprob": -0.24499512,
          "special": false,
          "text": " an"
        },
        {
          "id": 13883,
          "logprob": -0.010414124,
          "special": false,
          "text": " optimization"
        },
        {
          "id": 5687,
          "logprob": -0.00024354458,
          "special": false,
          "text": " algorithm"
        },
        {
          "id": 15574,
          "logprob": -0.6435547,
          "special": false,
          "text": " commonly"
        },
        {
          "id": 1304,
          "logprob": -0.0009279251,
          "special": false,
          "text": " used"
        },
        {
          "id": 297,
          "logprob": -0.19470215,
          "special": false,
          "text": " in"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "Gradient descent is an optimization algorithm commonly used in"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 198,
        "logprob": -2.9023438,
        "special": false,
        "text": "\n"
      },
      {
        "id": 2,
        "logprob": -2.9160156,
        "special": false,
        "text": "#"
      },
      {
        "id": 4230,
        "logprob": -3.1035156,
        "special": false,
        "text": " Create"
      },
      {
        "id": 264,
        "logprob": -1.1025391,
        "special": false,
        "text": " a"
      },
      {
        "id": 1681,
        "logprob": -1.6914062,
        "special": false,
        "text": " request"
      },
      {
        "id": 198,
        "logprob": -1.1953125,
        "special": false,
        "text": "\n"
      },
      {
        "id": 2035,
        "logprob": -1.3203125,
        "special": false,
        "text": "request"
      },
      {
        "id": 284,
        "logprob": -0.13537598,
        "special": false,
        "text": " ="
      },
      {
        "id": 7388,
        "logprob": -1.2402344,
        "special": false,
        "text": " requests"
      },
      {
        "id": 670,
        "logprob": -0.2775879,
        "special": false,
        "text": ".get"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n# Create a request\nrequest = requests.get"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 311,
        "logprob": -1.4277344,
        "special": false,
        "text": " to"
      },
      {
        "id": 279,
        "logprob": -0.65478516,
        "special": false,
        "text": " the"
      },
      {
        "id": 2473,
        "logprob": -1.8300781,
        "special": false,
        "text": " service"
      },
      {
        "id": 382,
        "logprob": -0.75,
        "special": false,
        "text": ".\n\n"
      },
      {
        "id": 286,
        "logprob": -0.11621094,
        "special": false,
        "text": "       "
      },
      {
        "id": 549,
        "logprob": 0.0,
        "special": false,
        "text": " :"
      },
      {
        "id": 689,
        "logprob": -0.48608398,
        "special": false,
        "text": "return"
      },
      {
        "id": 25,
        "logprob": 0.0,
        "special": false,
        "text": ":"
      },
      {
        "id": 5949,
        "logprob": -0.5756836,
        "special": false,
        "text": " Response"
      },
      {
        "id": 504,
        "logprob": -0.24499512,
        "special": false,
        "text": " from"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request to the service.\n\n        :return: Response from"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -2.9023438,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2,
          "logprob": -2.9140625,
          "special": false,
          "text": "#"
        },
        {
          "id": 4230,
          "logprob": -3.1054688,
          "special": false,
          "text": " Create"
        },
        {
          "id": 264,
          "logprob": -1.0966797,
          "special": false,
          "text": " a"
        },
        {
          "id": 1681,
          "logprob": -1.6914062,
          "special": false,
          "text": " request"
        },
        {
          "id": 198,
          "logprob": -1.1923828,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2035,
          "logprob": -1.3193359,
          "special": false,
          "text": "request"
        },
        {
          "id": 284,
          "logprob": -0.13586426,
          "special": false,
          "text": " ="
        },
        {
          "id": 7388,
          "logprob": -1.2412109,
          "special": false,
          "text": " requests"
        },
        {
          "id": 670,
          "logprob": -0.2775879,
          "special": false,
          "text": ".get"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n# Create a request\nrequest = requests.get"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -2.9023438,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2,
          "logprob": -2.9140625,
          "special": false,
          "text": "#"
        },
        {
          "id": 4230,
          "logprob": -3.1054688,
          "special": false,
          "text": " Create"
        },
        {
          "id": 264,
          "logprob": -1.0966797,
          "special": false,
          "text": " a"
        },
        {
          "id": 1681,
          "logprob": -1.6914062,
          "special": false,
          "text": " request"
        },
        {
          "id": 198,
          "logprob": -1.1923828,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2035,
          "logprob": -1.3193359,
          "special": false,
          "text": "request"
        },
        {
          "id": 284,
          "logprob": -0.13586426,
          "special": false,
          "text": " ="
        },
        {
          "id": 7388,
          "logprob": -1.2412109,
          "special": false,
          "text": " requests"
        },
        {
          "id": 670,
          "logprob": -0.2775879,
          "special": false,
          "text": ".get"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n# Create a request\nrequest = requests.get"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -2.9023438,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2,
          "logprob": -2.9140625,
          "special": false,
          "text": "#"
        },
        {
          "id": 4230,
          "logprob": -3.1054688,
          "special": false,
          "text": " Create"
        },
        {
          "id": 264,
          "logprob": -1.0966797,
          "special": false,
          "text": " a"
        },
        {
          "id": 1681,
          "logprob": -1.6914062,
          "special": false,
          "text": " request"
        },
        {
          "id": 198,
          "logprob": -1.1923828,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2035,
          "logprob": -1.3193359,
          "special": false,
          "text": "request"
        },
        {
          "id": 284,
          "logprob": -0.13586426,
          "special": false,
          "text": " ="
        },
        {
          "id": 7388,
          "logprob": -1.2412109,
          "special": false,
          "text": " requests"
        },
        {
          "id": 670,
          "logprob": -0.2775879,
          "special": false,
          "text": ".get"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n# Create a request\nrequest = requests.get"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -2.9023438,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2,
          "logprob": -2.9140625,
          "special": false,
          "text": "#"
        },
        {
          "id": 4230,
          "logprob": -3.1054688,
          "special": false,
          "text": " Create"
        },
        {
          "id": 264,
          "logprob": -1.0966797,
          "special": false,
          "text": " a"
        },
        {
          "id": 1681,
          "logprob": -1.6914062,
          "special": false,
          "text": " request"
        },
        {
          "id": 198,
          "logprob": -1.1923828,
          "special": false,
          "text": "\n"
        },
        {
          "id": 2035,
          "logprob": -1.3193359,
          "special": false,
          "text": "request"
        },
        {
          "id": 284,
          "logprob": -0.13586426,
          "special": false,
          "text": " ="
        },
        {
          "id": 7388,
          "logprob": -1.2412109,
          "special": false,
          "text": " requests"
        },
        {
          "id": 670,
          "logprob": -0.2775879,
          "special": false,
          "text": ".get"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n# Create a request\nrequest = requests.get"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_bay.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image showcases the Statue of Liberty, a colossal bronze statue located in New York Harbor, a heritage building in the United States. The statue has a majestic presence, with one arm raised towards the sun and the other hitched on her hip. It sits atop a keeper's walkway, observed from the water. Surrounding the statue is a lush green meadow, where picnic spots, walkways, and a visitor desk can be found. In front of the statue, a large marina can accommodate fourteen different kinds of boats. In the backdrop stands the Empire State Building, marking the crowded skyscrapers of New York City.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738342753,
  "id": "",
  "model": "Qwen/Qwen2.5-VL-3B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.0.2-dev0-native",
  "usage": {
    "completion_tokens": 128,
    "prompt_tokens": 8736,
    "total_tokens": 8864
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_inpaint.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image shows a whimsical scene set in what appears to be a fast-food restaurant. Dominating the foreground is a large, green, inflatable dinosaur with realistic textures, giving it a Jurassic Park-like appearance. The dinosaur is wearing a red Adult Swim logo hat, adding a humorous touch to its appearance.\n\nSurrounding the dinosaur are various food items typically found in a fast-food restaurant, including French fries in a plastic cup, a hamburger on a plate, and a beverage in another cup. The hamburger is detailed with lettuce, tomato, and other typical fast-food ingredients.\n\nAccompanying the dinosaur is a realistic-looking owl perched on the table, which adds to the surreal and playful atmosphere of the scene. The background features the interior of the restaurant with neon signs and other typical decor elements, enhancing the overall theme of a fun and fantastical fast-food experience.\n\nOverall, the image is a playful and imaginative blend of a standard fast-food setting with an unexpected and amusing twist provided by the dinosaur and owl characters.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738343775,
  "id": "",
  "model": "Qwen/Qwen2.5-VL-3B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.0.2-dev0-native",
  "usage": {
    "completion_tokens": 206,
    "prompt_tokens": 5375,
    "total_tokens": 5581
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image depicts an anthropomorphic rabbit character wearing an intricate space suit, which includes a helmet with a starry face pattern and multiple suitors. The rabbit's ears are significantly large and upright, and it has a hitchhiker-like star antennas on its chest. The background is a reddish-orange, rocky landscape, suggesting a Martian environment. The suit has various buttons, a red button on the chest, and a reflective or illuminated dome on the head. The overall color scheme is dominated by shades of red, orange, and gray, giving a sense of a rugged, otherworldly setting.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738342872,
  "id": "",
  "model": "Qwen/Qwen2.5-VL-3B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.0.2-dev0-native",
  "usage": {
    "completion_tokens": 121,
    "prompt_tokens": 1363,
    "total_tokens": 1484
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2_5_vl/test_flash_qwen2_5_vl_simple_streaming.json
================================================
{
  "choices": [
    {
      "delta": {
        "content": "",
        "role": "assistant",
        "tool_calls": null
      },
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null
    }
  ],
  "created": 1738343559,
  "id": "",
  "model": "Qwen/Qwen2.5-VL-3B-Instruct",
  "object": "chat.completion.chunk",
  "system_fingerprint": "3.0.2-dev0-native",
  "usage": null
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_bay.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image showcases a stunning cityscape, featuring the iconic Statue of Liberty in the foreground. The image displays Lady Liberty's imposing presence, with her towering base standing beside her. Behind the statue, the city's skyline extends across the horizon, adorned with numerous tall buildings, including the Empire State Building and other notable skyscrapers. The water reflecting the sun's rays creates a serene and picturesque scene, emphasizing the beauty and resilience of this global landmark. The sky is a clear, pale blue, adding to the overall tranquility of the scene.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738348090,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.1-dev0-native",
  "usage": {
    "completion_tokens": 110,
    "prompt_tokens": 8736,
    "total_tokens": 8846
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_inpaint.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image shows a stylized scene set in what appears to be a diner or restaurant. In the foreground, there is a table with various food items, including a burger with lettuce and tomato, a bowl of fries, and a drink in a cup with a straw. On the right side of the table, there is an owl sitting alertly, looking directly at the camera. Behind the owl and the table, there is a large, green, dinosaur-like creature resembling Godzilla, with its mouth open and tongue visible. In the background, the diner's decor includes various signs and posters, with a green sign reading \"Basta\" and another sign that says \"Tabasco.\" The setting has a retro or vintage feel, with fluorescent lighting overhead and clean, polished surfaces.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738348100,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.1-dev0-native",
  "usage": {
    "completion_tokens": 156,
    "prompt_tokens": 5375,
    "total_tokens": 5531
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1738347908,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.1-dev0-native",
  "usage": {
    "completion_tokens": 89,
    "prompt_tokens": 1364,
    "total_tokens": 1453
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple_streaming.json
================================================
{
  "choices": [
    {
      "delta": {
        "content": "",
        "role": "assistant",
        "tool_calls": null
      },
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null
    }
  ],
  "created": 1737646031,
  "id": "",
  "model": "Qwen/Qwen2-VL-7B-Instruct",
  "object": "chat.completion.chunk",
  "system_fingerprint": "3.0.2-dev0-native",
  "usage": null
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 1241,
        "logprob": -0.9863281,
        "special": false,
        "text": "():"
      },
      {
        "id": 258,
        "logprob": -0.21447754,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 942,
        "logprob": -0.43701172,
        "special": false,
        "text": " print"
      },
      {
        "id": 372,
        "logprob": -0.5361328,
        "special": false,
        "text": "(\""
      },
      {
        "id": 7371,
        "logprob": -0.44555664,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 9956,
        "logprob": -1.2412109,
        "special": false,
        "text": " World"
      },
      {
        "id": 8657,
        "logprob": -0.7583008,
        "special": false,
        "text": "!\")"
      },
      {
        "id": 185,
        "logprob": -0.76171875,
        "special": false,
        "text": "\n"
      },
      {
        "id": 185,
        "logprob": -0.20837402,
        "special": false,
        "text": "\n"
      },
      {
        "id": 1018,
        "logprob": -1.2470703,
        "special": false,
        "text": "print"
      }
    ]
  },
  "generated_text": "():\n    print(\"Hello World!\")\n\nprint"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_santacoder/test_flash_santacoder_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 1241,
          "logprob": -0.9863281,
          "special": false,
          "text": "():"
        },
        {
          "id": 258,
          "logprob": -0.21362305,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 942,
          "logprob": -0.44360352,
          "special": false,
          "text": " print"
        },
        {
          "id": 372,
          "logprob": -0.54248047,
          "special": false,
          "text": "(\""
        },
        {
          "id": 7371,
          "logprob": -0.44555664,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 9956,
          "logprob": -1.2441406,
          "special": false,
          "text": " World"
        },
        {
          "id": 8657,
          "logprob": -0.75878906,
          "special": false,
          "text": "!\")"
        },
        {
          "id": 185,
          "logprob": -0.76171875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 185,
          "logprob": -0.2084961,
          "special": false,
          "text": "\n"
        },
        {
          "id": 1018,
          "logprob": -1.2460938,
          "special": false,
          "text": "print"
        }
      ]
    },
    "generated_text": "():\n    print(\"Hello World!\")\n\nprint"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 1241,
          "logprob": -0.9863281,
          "special": false,
          "text": "():"
        },
        {
          "id": 258,
          "logprob": -0.21362305,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 942,
          "logprob": -0.44360352,
          "special": false,
          "text": " print"
        },
        {
          "id": 372,
          "logprob": -0.54248047,
          "special": false,
          "text": "(\""
        },
        {
          "id": 7371,
          "logprob": -0.44555664,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 9956,
          "logprob": -1.2441406,
          "special": false,
          "text": " World"
        },
        {
          "id": 8657,
          "logprob": -0.75878906,
          "special": false,
          "text": "!\")"
        },
        {
          "id": 185,
          "logprob": -0.76171875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 185,
          "logprob": -0.2084961,
          "special": false,
          "text": "\n"
        },
        {
          "id": 1018,
          "logprob": -1.2460938,
          "special": false,
          "text": "print"
        }
      ]
    },
    "generated_text": "():\n    print(\"Hello World!\")\n\nprint"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 1241,
          "logprob": -0.9863281,
          "special": false,
          "text": "():"
        },
        {
          "id": 258,
          "logprob": -0.21362305,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 942,
          "logprob": -0.44360352,
          "special": false,
          "text": " print"
        },
        {
          "id": 372,
          "logprob": -0.54248047,
          "special": false,
          "text": "(\""
        },
        {
          "id": 7371,
          "logprob": -0.44555664,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 9956,
          "logprob": -1.2441406,
          "special": false,
          "text": " World"
        },
        {
          "id": 8657,
          "logprob": -0.75878906,
          "special": false,
          "text": "!\")"
        },
        {
          "id": 185,
          "logprob": -0.76171875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 185,
          "logprob": -0.2084961,
          "special": false,
          "text": "\n"
        },
        {
          "id": 1018,
          "logprob": -1.2460938,
          "special": false,
          "text": "print"
        }
      ]
    },
    "generated_text": "():\n    print(\"Hello World!\")\n\nprint"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 1241,
          "logprob": -0.9863281,
          "special": false,
          "text": "():"
        },
        {
          "id": 258,
          "logprob": -0.21362305,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 942,
          "logprob": -0.44360352,
          "special": false,
          "text": " print"
        },
        {
          "id": 372,
          "logprob": -0.54248047,
          "special": false,
          "text": "(\""
        },
        {
          "id": 7371,
          "logprob": -0.44555664,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 9956,
          "logprob": -1.2441406,
          "special": false,
          "text": " World"
        },
        {
          "id": 8657,
          "logprob": -0.75878906,
          "special": false,
          "text": "!\")"
        },
        {
          "id": 185,
          "logprob": -0.76171875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 185,
          "logprob": -0.2084961,
          "special": false,
          "text": "\n"
        },
        {
          "id": 1018,
          "logprob": -1.2460938,
          "special": false,
          "text": "print"
        }
      ]
    },
    "generated_text": "():\n    print(\"Hello World!\")\n\nprint"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 2262,
        "logprob": -0.7705078,
        "special": false,
        "text": "():"
      },
      {
        "id": 284,
        "logprob": -0.2590332,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1459,
        "logprob": -0.39379883,
        "special": false,
        "text": " print"
      },
      {
        "id": 440,
        "logprob": -0.61376953,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8279,
        "logprob": -0.47338867,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 10896,
        "logprob": -1.5068359,
        "special": false,
        "text": " World"
      },
      {
        "id": 657,
        "logprob": -0.80810547,
        "special": false,
        "text": "\")"
      },
      {
        "id": 203,
        "logprob": -0.7397461,
        "special": false,
        "text": "\n"
      },
      {
        "id": 203,
        "logprob": -0.35229492,
        "special": false,
        "text": "\n"
      },
      {
        "id": 589,
        "logprob": -1.0371094,
        "special": false,
        "text": "def"
      }
    ]
  },
  "generated_text": "():\n    print(\"Hello World\")\n\ndef"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_default_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 60,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 2262,
        "logprob": -0.045715332,
        "special": false,
        "text": "():"
      },
      {
        "id": 284,
        "logprob": 0.0,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1459,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 440,
        "logprob": 0.0,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8279,
        "logprob": 0.0,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 10896,
        "logprob": -0.3659668,
        "special": false,
        "text": " World"
      },
      {
        "id": 657,
        "logprob": -0.5229492,
        "special": false,
        "text": "\")"
      },
      {
        "id": 203,
        "logprob": -0.10632324,
        "special": false,
        "text": "\n"
      },
      {
        "id": 203,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 589,
        "logprob": -0.20141602,
        "special": false,
        "text": "def"
      },
      {
        "id": 1459,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 81,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 7656,
        "logprob": 0.0,
        "special": false,
        "text": "hello"
      },
      {
        "id": 81,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 426,
        "logprob": -0.051635742,
        "special": false,
        "text": "name"
      },
      {
        "id": 26,
        "logprob": 0.0,
        "special": false,
        "text": "("
      },
      {
        "id": 426,
        "logprob": 0.0,
        "special": false,
        "text": "name"
      },
      {
        "id": 711,
        "logprob": 0.0,
        "special": false,
        "text": "):"
      },
      {
        "id": 284,
        "logprob": 0.0,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1459,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 440,
        "logprob": -0.16027832,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8279,
        "logprob": 0.0,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 313,
        "logprob": 0.0,
        "special": false,
        "text": " \""
      },
      {
        "id": 474,
        "logprob": 0.0,
        "special": false,
        "text": " +"
      },
      {
        "id": 636,
        "logprob": 0.0,
        "special": false,
        "text": " name"
      },
      {
        "id": 27,
        "logprob": 0.0,
        "special": false,
        "text": ")"
      },
      {
        "id": 203,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 203,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 589,
        "logprob": 0.0,
        "special": false,
        "text": "def"
      },
      {
        "id": 1459,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 81,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 7656,
        "logprob": 0.0,
        "special": false,
        "text": "hello"
      },
      {
        "id": 81,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 426,
        "logprob": 0.0,
        "special": false,
        "text": "name"
      },
      {
        "id": 81,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 381,
        "logprob": 0.0,
        "special": false,
        "text": "age"
      },
      {
        "id": 26,
        "logprob": 0.0,
        "special": false,
        "text": "("
      },
      {
        "id": 426,
        "logprob": 0.0,
        "special": false,
        "text": "name"
      },
      {
        "id": 30,
        "logprob": 0.0,
        "special": false,
        "text": ","
      },
      {
        "id": 11442,
        "logprob": 0.0,
        "special": false,
        "text": " age"
      },
      {
        "id": 711,
        "logprob": 0.0,
        "special": false,
        "text": "):"
      },
      {
        "id": 284,
        "logprob": 0.0,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1459,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 440,
        "logprob": 0.0,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8279,
        "logprob": 0.0,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 313,
        "logprob": 0.0,
        "special": false,
        "text": " \""
      },
      {
        "id": 474,
        "logprob": 0.0,
        "special": false,
        "text": " +"
      },
      {
        "id": 636,
        "logprob": 0.0,
        "special": false,
        "text": " name"
      },
      {
        "id": 474,
        "logprob": 0.0,
        "special": false,
        "text": " +"
      },
      {
        "id": 313,
        "logprob": -0.6933594,
        "special": false,
        "text": " \""
      },
      {
        "id": 313,
        "logprob": -1.7011719,
        "special": false,
        "text": " \""
      },
      {
        "id": 474,
        "logprob": 0.0,
        "special": false,
        "text": " +"
      },
      {
        "id": 596,
        "logprob": 0.0,
        "special": false,
        "text": " str"
      },
      {
        "id": 26,
        "logprob": 0.0,
        "special": false,
        "text": "("
      },
      {
        "id": 381,
        "logprob": 0.0,
        "special": false,
        "text": "age"
      },
      {
        "id": 490,
        "logprob": 0.0,
        "special": false,
        "text": "))"
      },
      {
        "id": 203,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 203,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 589,
        "logprob": 0.0,
        "special": false,
        "text": "def"
      },
      {
        "id": 1459,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "():\n    print(\"Hello World\")\n\ndef print_hello_name(name):\n    print(\"Hello \" + name)\n\ndef print_hello_name_age(name, age):\n    print(\"Hello \" + name + \" \" + str(age))\n\ndef print"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder/test_flash_starcoder_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 2262,
          "logprob": -0.7705078,
          "special": false,
          "text": "():"
        },
        {
          "id": 284,
          "logprob": -0.2602539,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 1459,
          "logprob": -0.39282227,
          "special": false,
          "text": " print"
        },
        {
          "id": 440,
          "logprob": -0.6113281,
          "special": false,
          "text": "(\""
        },
        {
          "id": 8279,
          "logprob": -0.4765625,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 10896,
          "logprob": -1.5068359,
          "special": false,
          "text": " World"
        },
        {
          "id": 657,
          "logprob": -0.8154297,
          "special": false,
          "text": "\")"
        },
        {
          "id": 203,
          "logprob": -0.7319336,
          "special": false,
          "text": "\n"
        },
        {
          "id": 203,
          "logprob": -0.35229492,
          "special": false,
          "text": "\n"
        },
        {
          "id": 589,
          "logprob": -1.0380859,
          "special": false,
          "text": "def"
        }
      ]
    },
    "generated_text": "():\n    print(\"Hello World\")\n\ndef"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 2262,
          "logprob": -0.7705078,
          "special": false,
          "text": "():"
        },
        {
          "id": 284,
          "logprob": -0.2602539,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 1459,
          "logprob": -0.39282227,
          "special": false,
          "text": " print"
        },
        {
          "id": 440,
          "logprob": -0.6113281,
          "special": false,
          "text": "(\""
        },
        {
          "id": 8279,
          "logprob": -0.4765625,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 10896,
          "logprob": -1.5068359,
          "special": false,
          "text": " World"
        },
        {
          "id": 657,
          "logprob": -0.8154297,
          "special": false,
          "text": "\")"
        },
        {
          "id": 203,
          "logprob": -0.7319336,
          "special": false,
          "text": "\n"
        },
        {
          "id": 203,
          "logprob": -0.35229492,
          "special": false,
          "text": "\n"
        },
        {
          "id": 589,
          "logprob": -1.0380859,
          "special": false,
          "text": "def"
        }
      ]
    },
    "generated_text": "():\n    print(\"Hello World\")\n\ndef"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 2262,
          "logprob": -0.7705078,
          "special": false,
          "text": "():"
        },
        {
          "id": 284,
          "logprob": -0.2602539,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 1459,
          "logprob": -0.39282227,
          "special": false,
          "text": " print"
        },
        {
          "id": 440,
          "logprob": -0.6113281,
          "special": false,
          "text": "(\""
        },
        {
          "id": 8279,
          "logprob": -0.4765625,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 10896,
          "logprob": -1.5068359,
          "special": false,
          "text": " World"
        },
        {
          "id": 657,
          "logprob": -0.8154297,
          "special": false,
          "text": "\")"
        },
        {
          "id": 203,
          "logprob": -0.7319336,
          "special": false,
          "text": "\n"
        },
        {
          "id": 203,
          "logprob": -0.35229492,
          "special": false,
          "text": "\n"
        },
        {
          "id": 589,
          "logprob": -1.0380859,
          "special": false,
          "text": "def"
        }
      ]
    },
    "generated_text": "():\n    print(\"Hello World\")\n\ndef"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 2262,
          "logprob": -0.7705078,
          "special": false,
          "text": "():"
        },
        {
          "id": 284,
          "logprob": -0.2602539,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 1459,
          "logprob": -0.39282227,
          "special": false,
          "text": " print"
        },
        {
          "id": 440,
          "logprob": -0.6113281,
          "special": false,
          "text": "(\""
        },
        {
          "id": 8279,
          "logprob": -0.4765625,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 10896,
          "logprob": -1.5068359,
          "special": false,
          "text": " World"
        },
        {
          "id": 657,
          "logprob": -0.8154297,
          "special": false,
          "text": "\")"
        },
        {
          "id": 203,
          "logprob": -0.7319336,
          "special": false,
          "text": "\n"
        },
        {
          "id": 203,
          "logprob": -0.35229492,
          "special": false,
          "text": "\n"
        },
        {
          "id": 589,
          "logprob": -1.0380859,
          "special": false,
          "text": "def"
        }
      ]
    },
    "generated_text": "():\n    print(\"Hello World\")\n\ndef"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 2284,
        "logprob": -0.92626953,
        "special": false,
        "text": "():"
      },
      {
        "id": 303,
        "logprob": -0.40844727,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1489,
        "logprob": -0.27905273,
        "special": false,
        "text": " print"
      },
      {
        "id": 459,
        "logprob": -0.6118164,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8302,
        "logprob": -0.68652344,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 10914,
        "logprob": -1.4619141,
        "special": false,
        "text": " World"
      },
      {
        "id": 16013,
        "logprob": -0.7993164,
        "special": false,
        "text": "!\")"
      },
      {
        "id": 222,
        "logprob": -0.63134766,
        "special": false,
        "text": "\n"
      },
      {
        "id": 222,
        "logprob": -0.23278809,
        "special": false,
        "text": "\n"
      },
      {
        "id": 610,
        "logprob": -1.2294922,
        "special": false,
        "text": "def"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_default_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 60,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 2284,
        "logprob": -0.31323242,
        "special": false,
        "text": "():"
      },
      {
        "id": 303,
        "logprob": 0.0,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1489,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 459,
        "logprob": 0.0,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8302,
        "logprob": -0.26611328,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 10914,
        "logprob": -0.7871094,
        "special": false,
        "text": " World"
      },
      {
        "id": 16013,
        "logprob": -0.64746094,
        "special": false,
        "text": "!\")"
      },
      {
        "id": 222,
        "logprob": -0.054870605,
        "special": false,
        "text": "\n"
      },
      {
        "id": 222,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 610,
        "logprob": -0.41064453,
        "special": false,
        "text": "def"
      },
      {
        "id": 1489,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 100,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 7670,
        "logprob": 0.0,
        "special": false,
        "text": "hello"
      },
      {
        "id": 100,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 444,
        "logprob": -0.21655273,
        "special": false,
        "text": "name"
      },
      {
        "id": 45,
        "logprob": 0.0,
        "special": false,
        "text": "("
      },
      {
        "id": 444,
        "logprob": 0.0,
        "special": false,
        "text": "name"
      },
      {
        "id": 731,
        "logprob": 0.0,
        "special": false,
        "text": "):"
      },
      {
        "id": 303,
        "logprob": 0.0,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1489,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 459,
        "logprob": 0.0,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8302,
        "logprob": 0.0,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 332,
        "logprob": -0.034698486,
        "special": false,
        "text": " \""
      },
      {
        "id": 494,
        "logprob": 0.0,
        "special": false,
        "text": " +"
      },
      {
        "id": 655,
        "logprob": 0.0,
        "special": false,
        "text": " name"
      },
      {
        "id": 494,
        "logprob": -0.20141602,
        "special": false,
        "text": " +"
      },
      {
        "id": 332,
        "logprob": 0.0,
        "special": false,
        "text": " \""
      },
      {
        "id": 16013,
        "logprob": 0.0,
        "special": false,
        "text": "!\")"
      },
      {
        "id": 222,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 222,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 610,
        "logprob": 0.0,
        "special": false,
        "text": "def"
      },
      {
        "id": 1489,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 100,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 7670,
        "logprob": 0.0,
        "special": false,
        "text": "hello"
      },
      {
        "id": 100,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 444,
        "logprob": 0.0,
        "special": false,
        "text": "name"
      },
      {
        "id": 100,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 400,
        "logprob": 0.0,
        "special": false,
        "text": "age"
      },
      {
        "id": 45,
        "logprob": 0.0,
        "special": false,
        "text": "("
      },
      {
        "id": 444,
        "logprob": 0.0,
        "special": false,
        "text": "name"
      },
      {
        "id": 49,
        "logprob": 0.0,
        "special": false,
        "text": ","
      },
      {
        "id": 11505,
        "logprob": 0.0,
        "special": false,
        "text": " age"
      },
      {
        "id": 731,
        "logprob": 0.0,
        "special": false,
        "text": "):"
      },
      {
        "id": 303,
        "logprob": 0.0,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1489,
        "logprob": 0.0,
        "special": false,
        "text": " print"
      },
      {
        "id": 459,
        "logprob": 0.0,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8302,
        "logprob": 0.0,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 332,
        "logprob": 0.0,
        "special": false,
        "text": " \""
      },
      {
        "id": 494,
        "logprob": 0.0,
        "special": false,
        "text": " +"
      },
      {
        "id": 655,
        "logprob": 0.0,
        "special": false,
        "text": " name"
      },
      {
        "id": 494,
        "logprob": 0.0,
        "special": false,
        "text": " +"
      },
      {
        "id": 3021,
        "logprob": -0.5761719,
        "special": false,
        "text": " \","
      },
      {
        "id": 863,
        "logprob": 0.0,
        "special": false,
        "text": " you"
      },
      {
        "id": 904,
        "logprob": 0.0,
        "special": false,
        "text": " are"
      },
      {
        "id": 332,
        "logprob": 0.0,
        "special": false,
        "text": " \""
      },
      {
        "id": 494,
        "logprob": 0.0,
        "special": false,
        "text": " +"
      },
      {
        "id": 615,
        "logprob": 0.0,
        "special": false,
        "text": " str"
      },
      {
        "id": 45,
        "logprob": 0.0,
        "special": false,
        "text": "("
      },
      {
        "id": 400,
        "logprob": 0.0,
        "special": false,
        "text": "age"
      },
      {
        "id": 46,
        "logprob": 0.0,
        "special": false,
        "text": ")"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "():\n    print(\"Hello World!\")\n\ndef print_hello_name(name):\n    print(\"Hello \" + name + \"!\")\n\ndef print_hello_name_age(name, age):\n    print(\"Hello \" + name + \", you are \" + str(age)"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 2284,
          "logprob": -0.92626953,
          "special": false,
          "text": "():"
        },
        {
          "id": 303,
          "logprob": -0.40722656,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 1489,
          "logprob": -0.27954102,
          "special": false,
          "text": " print"
        },
        {
          "id": 459,
          "logprob": -0.6142578,
          "special": false,
          "text": "(\""
        },
        {
          "id": 8302,
          "logprob": -0.68310547,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 10914,
          "logprob": -1.4570312,
          "special": false,
          "text": " World"
        },
        {
          "id": 16013,
          "logprob": -0.80126953,
          "special": false,
          "text": "!\")"
        },
        {
          "id": 222,
          "logprob": -0.6303711,
          "special": false,
          "text": "\n"
        },
        {
          "id": 222,
          "logprob": -0.23327637,
          "special": false,
          "text": "\n"
        },
        {
          "id": 610,
          "logprob": -1.2304688,
          "special": false,
          "text": "def"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 2284,
          "logprob": -0.92626953,
          "special": false,
          "text": "():"
        },
        {
          "id": 303,
          "logprob": -0.40722656,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 1489,
          "logprob": -0.27954102,
          "special": false,
          "text": " print"
        },
        {
          "id": 459,
          "logprob": -0.6142578,
          "special": false,
          "text": "(\""
        },
        {
          "id": 8302,
          "logprob": -0.68310547,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 10914,
          "logprob": -1.4570312,
          "special": false,
          "text": " World"
        },
        {
          "id": 16013,
          "logprob": -0.80126953,
          "special": false,
          "text": "!\")"
        },
        {
          "id": 222,
          "logprob": -0.6303711,
          "special": false,
          "text": "\n"
        },
        {
          "id": 222,
          "logprob": -0.23327637,
          "special": false,
          "text": "\n"
        },
        {
          "id": 610,
          "logprob": -1.2304688,
          "special": false,
          "text": "def"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 2284,
          "logprob": -0.92626953,
          "special": false,
          "text": "():"
        },
        {
          "id": 303,
          "logprob": -0.40722656,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 1489,
          "logprob": -0.27954102,
          "special": false,
          "text": " print"
        },
        {
          "id": 459,
          "logprob": -0.6142578,
          "special": false,
          "text": "(\""
        },
        {
          "id": 8302,
          "logprob": -0.68310547,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 10914,
          "logprob": -1.4570312,
          "special": false,
          "text": " World"
        },
        {
          "id": 16013,
          "logprob": -0.80126953,
          "special": false,
          "text": "!\")"
        },
        {
          "id": 222,
          "logprob": -0.6303711,
          "special": false,
          "text": "\n"
        },
        {
          "id": 222,
          "logprob": -0.23327637,
          "special": false,
          "text": "\n"
        },
        {
          "id": 610,
          "logprob": -1.2304688,
          "special": false,
          "text": "def"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 2284,
          "logprob": -0.92626953,
          "special": false,
          "text": "():"
        },
        {
          "id": 303,
          "logprob": -0.40722656,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 1489,
          "logprob": -0.27954102,
          "special": false,
          "text": " print"
        },
        {
          "id": 459,
          "logprob": -0.6142578,
          "special": false,
          "text": "(\""
        },
        {
          "id": 8302,
          "logprob": -0.68310547,
          "special": false,
          "text": "Hello"
        },
        {
          "id": 10914,
          "logprob": -1.4570312,
          "special": false,
          "text": " World"
        },
        {
          "id": 16013,
          "logprob": -0.80126953,
          "special": false,
          "text": "!\")"
        },
        {
          "id": 222,
          "logprob": -0.6303711,
          "special": false,
          "text": "\n"
        },
        {
          "id": 222,
          "logprob": -0.23327637,
          "special": false,
          "text": "\n"
        },
        {
          "id": 610,
          "logprob": -1.2304688,
          "special": false,
          "text": "def"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 2284,
        "logprob": -0.9355469,
        "special": false,
        "text": "():"
      },
      {
        "id": 303,
        "logprob": -0.40795898,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1489,
        "logprob": -0.27954102,
        "special": false,
        "text": " print"
      },
      {
        "id": 459,
        "logprob": -0.6142578,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8302,
        "logprob": -0.68310547,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 10914,
        "logprob": -1.4599609,
        "special": false,
        "text": " World"
      },
      {
        "id": 16013,
        "logprob": -0.80126953,
        "special": false,
        "text": "!\")"
      },
      {
        "id": 222,
        "logprob": -0.625,
        "special": false,
        "text": "\n"
      },
      {
        "id": 222,
        "logprob": -0.23242188,
        "special": false,
        "text": "\n"
      },
      {
        "id": 610,
        "logprob": -1.2294922,
        "special": false,
        "text": "def"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_default_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 60,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 222,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 222,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 40,
        "logprob": -0.7944336,
        "special": false,
        "text": "#"
      },
      {
        "id": 494,
        "logprob": 0.0,
        "special": false,
        "text": " +"
      },
      {
        "id": 447,
        "logprob": -0.1796875,
        "special": false,
        "text": " ["
      },
      {
        "id": 9009,
        "logprob": 0.0,
        "special": false,
        "text": "markdown"
      },
      {
        "id": 98,
        "logprob": 0.0,
        "special": false,
        "text": "]"
      },
      {
        "id": 37402,
        "logprob": 0.0,
        "special": false,
        "text": " slideshow"
      },
      {
        "id": 8492,
        "logprob": 0.0,
        "special": false,
        "text": "={\""
      },
      {
        "id": 7277,
        "logprob": 0.0,
        "special": false,
        "text": "slide"
      },
      {
        "id": 100,
        "logprob": 0.0,
        "special": false,
        "text": "_"
      },
      {
        "id": 700,
        "logprob": 0.0,
        "special": false,
        "text": "type"
      },
      {
        "id": 582,
        "logprob": 0.0,
        "special": false,
        "text": "\":"
      },
      {
        "id": 332,
        "logprob": 0.0,
        "special": false,
        "text": " \""
      },
      {
        "id": 7277,
        "logprob": -0.06994629,
        "special": false,
        "text": "slide"
      },
      {
        "id": 3667,
        "logprob": 0.0,
        "special": false,
        "text": "\"}"
      },
      {
        "id": 222,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 40,
        "logprob": 0.0,
        "special": false,
        "text": "#"
      },
      {
        "id": 607,
        "logprob": -0.8261719,
        "special": false,
        "text": " #"
      },
      {
        "id": 244,
        "logprob": -1.8574219,
        "special": false,
        "text": " "
      },
      {
        "id": 55,
        "logprob": -1.4541016,
        "special": false,
        "text": "2"
      },
      {
        "id": 51,
        "logprob": 0.0,
        "special": false,
        "text": "."
      },
      {
        "id": 6208,
        "logprob": -0.9794922,
        "special": false,
        "text": " What"
      },
      {
        "id": 458,
        "logprob": 0.0,
        "special": false,
        "text": " is"
      },
      {
        "id": 341,
        "logprob": 0.0,
        "special": false,
        "text": " the"
      },
      {
        "id": 10609,
        "logprob": -0.69189453,
        "special": false,
        "text": " difference"
      },
      {
        "id": 3761,
        "logprob": 0.0,
        "special": false,
        "text": " between"
      },
      {
        "id": 331,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 1168,
        "logprob": -0.27172852,
        "special": false,
        "text": " list"
      },
      {
        "id": 480,
        "logprob": 0.0,
        "special": false,
        "text": " and"
      },
      {
        "id": 331,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 8871,
        "logprob": 0.0,
        "special": false,
        "text": " tuple"
      },
      {
        "id": 68,
        "logprob": 0.0,
        "special": false,
        "text": "?"
      },
      {
        "id": 222,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 40,
        "logprob": -1.3359375,
        "special": false,
        "text": "#"
      },
      {
        "id": 222,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 40,
        "logprob": 0.0,
        "special": false,
        "text": "#"
      },
      {
        "id": 449,
        "logprob": -0.03164673,
        "special": false,
        "text": " -"
      },
      {
        "id": 418,
        "logprob": -1.0947266,
        "special": false,
        "text": " A"
      },
      {
        "id": 1168,
        "logprob": 0.0,
        "special": false,
        "text": " list"
      },
      {
        "id": 458,
        "logprob": 0.0,
        "special": false,
        "text": " is"
      },
      {
        "id": 331,
        "logprob": -0.3305664,
        "special": false,
        "text": " a"
      },
      {
        "id": 14792,
        "logprob": 0.0,
        "special": false,
        "text": " mutable"
      },
      {
        "id": 6645,
        "logprob": -0.40478516,
        "special": false,
        "text": " sequence"
      },
      {
        "id": 451,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 4725,
        "logprob": -0.50390625,
        "special": false,
        "text": " elements"
      },
      {
        "id": 49,
        "logprob": -2.1269531,
        "special": false,
        "text": ","
      },
      {
        "id": 2236,
        "logprob": -0.1427002,
        "special": false,
        "text": " while"
      },
      {
        "id": 331,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 8871,
        "logprob": 0.0,
        "special": false,
        "text": " tuple"
      },
      {
        "id": 458,
        "logprob": 0.0,
        "special": false,
        "text": " is"
      },
      {
        "id": 619,
        "logprob": 0.0,
        "special": false,
        "text": " an"
      },
      {
        "id": 26079,
        "logprob": 0.0,
        "special": false,
        "text": " immutable"
      },
      {
        "id": 6645,
        "logprob": 0.0,
        "special": false,
        "text": " sequence"
      },
      {
        "id": 451,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 4725,
        "logprob": 0.0,
        "special": false,
        "text": " elements"
      },
      {
        "id": 51,
        "logprob": 0.0,
        "special": false,
        "text": "."
      },
      {
        "id": 222,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 40,
        "logprob": 0.0,
        "special": false,
        "text": "#"
      },
      {
        "id": 449,
        "logprob": 0.0,
        "special": false,
        "text": " -"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\n# + [markdown] slideshow={\"slide_type\": \"slide\"}\n# # 2. What is the difference between a list and a tuple?\n#\n# - A list is a mutable sequence of elements, while a tuple is an immutable sequence of elements.\n# -"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 222,
          "logprob": -1.9091797,
          "special": false,
          "text": "\n"
        },
        {
          "id": 222,
          "logprob": -1.0478516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 40,
          "logprob": -3.015625,
          "special": false,
          "text": "#"
        },
        {
          "id": 494,
          "logprob": -1.4228516,
          "special": false,
          "text": " +"
        },
        {
          "id": 447,
          "logprob": -1.1025391,
          "special": false,
          "text": " ["
        },
        {
          "id": 9009,
          "logprob": -0.0008444786,
          "special": false,
          "text": "markdown"
        },
        {
          "id": 98,
          "logprob": -8.8095665e-05,
          "special": false,
          "text": "]"
        },
        {
          "id": 37402,
          "logprob": -0.5810547,
          "special": false,
          "text": " slideshow"
        },
        {
          "id": 8492,
          "logprob": -0.00022864342,
          "special": false,
          "text": "={\""
        },
        {
          "id": 7277,
          "logprob": -0.00030994415,
          "special": false,
          "text": "slide"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\n# + [markdown] slideshow={\"slide"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 222,
          "logprob": -1.9091797,
          "special": false,
          "text": "\n"
        },
        {
          "id": 222,
          "logprob": -1.0478516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 40,
          "logprob": -3.015625,
          "special": false,
          "text": "#"
        },
        {
          "id": 494,
          "logprob": -1.4228516,
          "special": false,
          "text": " +"
        },
        {
          "id": 447,
          "logprob": -1.1025391,
          "special": false,
          "text": " ["
        },
        {
          "id": 9009,
          "logprob": -0.0008444786,
          "special": false,
          "text": "markdown"
        },
        {
          "id": 98,
          "logprob": -8.8095665e-05,
          "special": false,
          "text": "]"
        },
        {
          "id": 37402,
          "logprob": -0.5810547,
          "special": false,
          "text": " slideshow"
        },
        {
          "id": 8492,
          "logprob": -0.00022864342,
          "special": false,
          "text": "={\""
        },
        {
          "id": 7277,
          "logprob": -0.00030994415,
          "special": false,
          "text": "slide"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\n# + [markdown] slideshow={\"slide"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 222,
          "logprob": -1.9091797,
          "special": false,
          "text": "\n"
        },
        {
          "id": 222,
          "logprob": -1.0478516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 40,
          "logprob": -3.015625,
          "special": false,
          "text": "#"
        },
        {
          "id": 494,
          "logprob": -1.4228516,
          "special": false,
          "text": " +"
        },
        {
          "id": 447,
          "logprob": -1.1025391,
          "special": false,
          "text": " ["
        },
        {
          "id": 9009,
          "logprob": -0.0008444786,
          "special": false,
          "text": "markdown"
        },
        {
          "id": 98,
          "logprob": -8.8095665e-05,
          "special": false,
          "text": "]"
        },
        {
          "id": 37402,
          "logprob": -0.5810547,
          "special": false,
          "text": " slideshow"
        },
        {
          "id": 8492,
          "logprob": -0.00022864342,
          "special": false,
          "text": "={\""
        },
        {
          "id": 7277,
          "logprob": -0.00030994415,
          "special": false,
          "text": "slide"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\n# + [markdown] slideshow={\"slide"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 222,
          "logprob": -1.9091797,
          "special": false,
          "text": "\n"
        },
        {
          "id": 222,
          "logprob": -1.0478516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 40,
          "logprob": -3.015625,
          "special": false,
          "text": "#"
        },
        {
          "id": 494,
          "logprob": -1.4228516,
          "special": false,
          "text": " +"
        },
        {
          "id": 447,
          "logprob": -1.1025391,
          "special": false,
          "text": " ["
        },
        {
          "id": 9009,
          "logprob": -0.0008444786,
          "special": false,
          "text": "markdown"
        },
        {
          "id": 98,
          "logprob": -8.8095665e-05,
          "special": false,
          "text": "]"
        },
        {
          "id": 37402,
          "logprob": -0.5810547,
          "special": false,
          "text": " slideshow"
        },
        {
          "id": 8492,
          "logprob": -0.00022864342,
          "special": false,
          "text": "={\""
        },
        {
          "id": 7277,
          "logprob": -0.00030994415,
          "special": false,
          "text": "slide"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\n# + [markdown] slideshow={\"slide"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder2_lora/test_flash_starcoder2_with_hugcode_adapter.json
================================================
{
  "details": {
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 100,
        "logprob": -0.9824219,
        "special": false,
        "text": "_"
      },
      {
        "id": 5879,
        "logprob": -0.3017578,
        "special": false,
        "text": "world"
      },
      {
        "id": 2284,
        "logprob": -0.68652344,
        "special": false,
        "text": "():"
      },
      {
        "id": 303,
        "logprob": -0.27734375,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 1489,
        "logprob": -0.4482422,
        "special": false,
        "text": " print"
      },
      {
        "id": 459,
        "logprob": -0.54248047,
        "special": false,
        "text": "(\""
      },
      {
        "id": 8302,
        "logprob": -0.4296875,
        "special": false,
        "text": "Hello"
      },
      {
        "id": 10914,
        "logprob": -0.8544922,
        "special": false,
        "text": " World"
      },
      {
        "id": 16013,
        "logprob": -0.7573242,
        "special": false,
        "text": "!\")"
      },
      {
        "id": 222,
        "logprob": -0.81347656,
        "special": false,
        "text": "\n"
      }
    ]
  },
  "generated_text": "_world():\n    print(\"Hello World!\")\n"
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 2,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 284,
        "logprob": -1.1679688,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 0,
        "logprob": null,
        "special": true,
        "text": "<|endoftext|>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n   "
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 2,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 284,
        "logprob": -0.048583984,
        "special": false,
        "text": "\n   "
      },
      {
        "id": 0,
        "logprob": null,
        "special": true,
        "text": "<|endoftext|>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n   "
}


================================================
FILE: integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 2,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 284,
          "logprob": -0.046844482,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 0,
          "logprob": null,
          "special": true,
          "text": "<|endoftext|>"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n   "
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 2,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 284,
          "logprob": -0.046722412,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 0,
          "logprob": null,
          "special": true,
          "text": "<|endoftext|>"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n   "
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 2,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 284,
          "logprob": -0.04650879,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 0,
          "logprob": null,
          "special": true,
          "text": "<|endoftext|>"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n   "
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 2,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 284,
          "logprob": -0.046539307,
          "special": false,
          "text": "\n   "
        },
        {
          "id": 0,
          "logprob": null,
          "special": true,
          "text": "<|endoftext|>"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n   "
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_grammar_llama/test_non_flash_llama_grammar_json.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 30,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 6377,
        "logprob": -0.14916992,
        "special": false,
        "text": "{\""
      },
      {
        "id": 29888,
        "logprob": -0.13598633,
        "special": false,
        "text": "f"
      },
      {
        "id": 12935,
        "logprob": -0.017669678,
        "special": false,
        "text": "irs"
      },
      {
        "id": 29873,
        "logprob": -0.00085639954,
        "special": false,
        "text": "t"
      },
      {
        "id": 1170,
        "logprob": -0.0054016113,
        "special": false,
        "text": "Name"
      },
      {
        "id": 4710,
        "logprob": -0.13549805,
        "special": false,
        "text": "\":\""
      },
      {
        "id": 19504,
        "logprob": -0.8852539,
        "special": false,
        "text": "David"
      },
      {
        "id": 3284,
        "logprob": -0.16394043,
        "special": false,
        "text": "\",\""
      },
      {
        "id": 29882,
        "logprob": -0.08862305,
        "special": false,
        "text": "h"
      },
      {
        "id": 711,
        "logprob": -0.66259766,
        "special": false,
        "text": "ob"
      },
      {
        "id": 1609,
        "logprob": -5.51939e-05,
        "special": false,
        "text": "by"
      },
      {
        "id": 4710,
        "logprob": -0.23120117,
        "special": false,
        "text": "\":\""
      },
      {
        "id": 29911,
        "logprob": -2.3730469,
        "special": false,
        "text": "T"
      },
      {
        "id": 11003,
        "logprob": -0.032104492,
        "special": false,
        "text": "rees"
      },
      {
        "id": 3284,
        "logprob": -0.22021484,
        "special": false,
        "text": "\",\""
      },
      {
        "id": 4230,
        "logprob": -0.06726074,
        "special": false,
        "text": "last"
      },
      {
        "id": 1170,
        "logprob": -0.003501892,
        "special": false,
        "text": "Name"
      },
      {
        "id": 4710,
        "logprob": -0.0045661926,
        "special": false,
        "text": "\":\""
      },
      {
        "id": 29950,
        "logprob": -0.12512207,
        "special": false,
        "text": "H"
      },
      {
        "id": 14339,
        "logprob": -0.009552002,
        "special": false,
        "text": "olt"
      },
      {
        "id": 29920,
        "logprob": -0.00042438507,
        "special": false,
        "text": "z"
      },
      {
        "id": 3284,
        "logprob": -0.11651611,
        "special": false,
        "text": "\",\""
      },
      {
        "id": 29876,
        "logprob": -0.29736328,
        "special": false,
        "text": "n"
      },
      {
        "id": 398,
        "logprob": -0.003030777,
        "special": false,
        "text": "um"
      },
      {
        "id": 29907,
        "logprob": -0.3774414,
        "special": false,
        "text": "C"
      },
      {
        "id": 1446,
        "logprob": -0.0003130436,
        "special": false,
        "text": "ats"
      },
      {
        "id": 1115,
        "logprob": -0.0021514893,
        "special": false,
        "text": "\":"
      },
      {
        "id": 29906,
        "logprob": -0.071899414,
        "special": false,
        "text": "2"
      },
      {
        "id": 29913,
        "logprob": -0.018997192,
        "special": false,
        "text": "}"
      },
      {
        "id": 2,
        "logprob": 0.0,
        "special": true,
        "text": "</s>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "{\"firstName\":\"David\",\"hobby\":\"Trees\",\"lastName\":\"Holtz\",\"numCats\":2}"
}


================================================
FILE: integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.1.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "{ \"unit\": \"fahrenheit\", \"temperature\": [ 72, 79, 88 ] }",
        "role": "assistant"
      }
    }
  ],
  "created": 1740095072,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "chat.completion",
  "system_fingerprint": "3.1.1-dev0-native",
  "usage": {
    "completion_tokens": 29,
    "prompt_tokens": 135,
    "total_tokens": 164
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.2.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "{ \"unit\": \"fahrenheit\", \"temperature\": [ 72, 79, 88 ] }",
        "role": "assistant"
      }
    }
  ],
  "created": 1740095073,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "chat.completion",
  "system_fingerprint": "3.1.1-dev0-native",
  "usage": {
    "completion_tokens": 29,
    "prompt_tokens": 135,
    "total_tokens": 164
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "{ \"unit\": \"fahrenheit\", \"temperature\": [ 72, 79, 88 ] }",
        "role": "assistant"
      }
    }
  ],
  "created": 1732525803,
  "id": "",
  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
  "object": "chat.completion",
  "system_fingerprint": "2.4.1-dev0-native",
  "usage": {
    "completion_tokens": 29,
    "prompt_tokens": 136,
    "total_tokens": 165
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_idefics/test_idefics.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 1,
        "logprob": null,
        "text": "<s>"
      },
      {
        "id": 4911,
        "logprob": -6.9765625,
        "text": "User"
      },
      {
        "id": 29901,
        "logprob": -0.0059432983,
        "text": ":"
      },
      {
        "id": 32000,
        "logprob": -0.8408203,
        "text": "<fake_token_around_image>"
      },
      {
        "id": 32001,
        "logprob": -9.906292e-05,
        "text": "<image>"
      },
      {
        "id": 32000,
        "logprob": -2.3841858e-07,
        "text": "<fake_token_around_image>"
      },
      {
        "id": 1815,
        "logprob": -4.1679688,
        "text": "Can"
      },
      {
        "id": 366,
        "logprob": -0.014099121,
        "text": "you"
      },
      {
        "id": 2649,
        "logprob": -4.4609375,
        "text": "tell"
      },
      {
        "id": 592,
        "logprob": -0.29882812,
        "text": "me"
      },
      {
        "id": 263,
        "logprob": -4.1445312,
        "text": "a"
      },
      {
        "id": 1407,
        "logprob": -9.3828125,
        "text": "very"
      },
      {
        "id": 3273,
        "logprob": -1.9736328,
        "text": "short"
      },
      {
        "id": 5828,
        "logprob": -0.2800293,
        "text": "story"
      },
      {
        "id": 2729,
        "logprob": -3.5625,
        "text": "based"
      },
      {
        "id": 373,
        "logprob": -0.0006427765,
        "text": "on"
      },
      {
        "id": 278,
        "logprob": -0.13952637,
        "text": "the"
      },
      {
        "id": 1967,
        "logprob": -0.068115234,
        "text": "image"
      },
      {
        "id": 29973,
        "logprob": -0.16357422,
        "text": "?"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 32002,
        "logprob": -0.0026474,
        "special": true,
        "text": "<end_of_utterance>"
      },
      {
        "id": 29871,
        "logprob": -8.547306e-05,
        "special": false,
        "text": " "
      },
      {
        "id": 13,
        "logprob": -1.7881393e-05,
        "special": false,
        "text": "\n"
      },
      {
        "id": 7900,
        "logprob": -3.0994415e-06,
        "special": false,
        "text": "Ass"
      },
      {
        "id": 22137,
        "logprob": 0.0,
        "special": false,
        "text": "istant"
      },
      {
        "id": 29901,
        "logprob": -3.2186508e-06,
        "special": false,
        "text": ":"
      },
      {
        "id": 319,
        "logprob": -0.92529297,
        "special": false,
        "text": " A"
      },
      {
        "id": 696,
        "logprob": -1.1269531,
        "special": false,
        "text": " ro"
      },
      {
        "id": 15664,
        "logprob": -0.00029492378,
        "special": false,
        "text": "oster"
      },
      {
        "id": 15028,
        "logprob": -1.1855469,
        "special": false,
        "text": " stands"
      }
    ]
  },
  "generated_text": " \nAssistant: A rooster stands"
}


================================================
FILE: integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 1,
          "logprob": null,
          "text": "<s>"
        },
        {
          "id": 4911,
          "logprob": -6.9804688,
          "text": "User"
        },
        {
          "id": 29901,
          "logprob": -0.006122589,
          "text": ":"
        },
        {
          "id": 32000,
          "logprob": -0.8417969,
          "text": "<fake_token_around_image>"
        },
        {
          "id": 32001,
          "logprob": -9.918213e-05,
          "text": "<image>"
        },
        {
          "id": 32000,
          "logprob": -2.3841858e-07,
          "text": "<fake_token_around_image>"
        },
        {
          "id": 1815,
          "logprob": -4.1679688,
          "text": "Can"
        },
        {
          "id": 366,
          "logprob": -0.014091492,
          "text": "you"
        },
        {
          "id": 2649,
          "logprob": -4.4726562,
          "text": "tell"
        },
        {
          "id": 592,
          "logprob": -0.2998047,
          "text": "me"
        },
        {
          "id": 263,
          "logprob": -4.15625,
          "text": "a"
        },
        {
          "id": 1407,
          "logprob": -9.3828125,
          "text": "very"
        },
        {
          "id": 3273,
          "logprob": -1.9716797,
          "text": "short"
        },
        {
          "id": 5828,
          "logprob": -0.27734375,
          "text": "story"
        },
        {
          "id": 2729,
          "logprob": -3.5605469,
          "text": "based"
        },
        {
          "id": 373,
          "logprob": -0.00064468384,
          "text": "on"
        },
        {
          "id": 278,
          "logprob": -0.14160156,
          "text": "the"
        },
        {
          "id": 1967,
          "logprob": -0.06915283,
          "text": "image"
        },
        {
          "id": 29973,
          "logprob": -0.16381836,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 32002,
          "logprob": -0.0026664734,
          "special": true,
          "text": "<end_of_utterance>"
        },
        {
          "id": 29871,
          "logprob": -8.583069e-05,
          "special": false,
          "text": " "
        },
        {
          "id": 13,
          "logprob": -1.8119812e-05,
          "special": false,
          "text": "\n"
        },
        {
          "id": 7900,
          "logprob": -2.9802322e-06,
          "special": false,
          "text": "Ass"
        },
        {
          "id": 22137,
          "logprob": 0.0,
          "special": false,
          "text": "istant"
        },
        {
          "id": 29901,
          "logprob": -3.2186508e-06,
          "special": false,
          "text": ":"
        },
        {
          "id": 319,
          "logprob": -0.9301758,
          "special": false,
          "text": " A"
        },
        {
          "id": 696,
          "logprob": -1.1279297,
          "special": false,
          "text": " ro"
        },
        {
          "id": 15664,
          "logprob": -0.0002939701,
          "special": false,
          "text": "oster"
        },
        {
          "id": 15028,
          "logprob": -1.1865234,
          "special": false,
          "text": " stands"
        }
      ]
    },
    "generated_text": " \nAssistant: A rooster stands"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 1,
          "logprob": null,
          "text": "<s>"
        },
        {
          "id": 4911,
          "logprob": -6.9804688,
          "text": "User"
        },
        {
          "id": 29901,
          "logprob": -0.006122589,
          "text": ":"
        },
        {
          "id": 32000,
          "logprob": -0.8417969,
          "text": "<fake_token_around_image>"
        },
        {
          "id": 32001,
          "logprob": -9.942055e-05,
          "text": "<image>"
        },
        {
          "id": 32000,
          "logprob": -2.3841858e-07,
          "text": "<fake_token_around_image>"
        },
        {
          "id": 1815,
          "logprob": -4.1679688,
          "text": "Can"
        },
        {
          "id": 366,
          "logprob": -0.014091492,
          "text": "you"
        },
        {
          "id": 2649,
          "logprob": -4.4726562,
          "text": "tell"
        },
        {
          "id": 592,
          "logprob": -0.2998047,
          "text": "me"
        },
        {
          "id": 263,
          "logprob": -4.15625,
          "text": "a"
        },
        {
          "id": 1407,
          "logprob": -9.3828125,
          "text": "very"
        },
        {
          "id": 3273,
          "logprob": -1.9716797,
          "text": "short"
        },
        {
          "id": 5828,
          "logprob": -0.27734375,
          "text": "story"
        },
        {
          "id": 2729,
          "logprob": -3.5605469,
          "text": "based"
        },
        {
          "id": 373,
          "logprob": -0.0006451607,
          "text": "on"
        },
        {
          "id": 278,
          "logprob": -0.14160156,
          "text": "the"
        },
        {
          "id": 1967,
          "logprob": -0.06915283,
          "text": "image"
        },
        {
          "id": 29973,
          "logprob": -0.16381836,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 32002,
          "logprob": -0.0026664734,
          "special": true,
          "text": "<end_of_utterance>"
        },
        {
          "id": 29871,
          "logprob": -8.571148e-05,
          "special": false,
          "text": " "
        },
        {
          "id": 13,
          "logprob": -1.8119812e-05,
          "special": false,
          "text": "\n"
        },
        {
          "id": 7900,
          "logprob": -3.0994415e-06,
          "special": false,
          "text": "Ass"
        },
        {
          "id": 22137,
          "logprob": 0.0,
          "special": false,
          "text": "istant"
        },
        {
          "id": 29901,
          "logprob": -3.0994415e-06,
          "special": false,
          "text": ":"
        },
        {
          "id": 319,
          "logprob": -0.9301758,
          "special": false,
          "text": " A"
        },
        {
          "id": 696,
          "logprob": -1.1279297,
          "special": false,
          "text": " ro"
        },
        {
          "id": 15664,
          "logprob": -0.0002939701,
          "special": false,
          "text": "oster"
        },
        {
          "id": 15028,
          "logprob": -1.1865234,
          "special": false,
          "text": " stands"
        }
      ]
    },
    "generated_text": " \nAssistant: A rooster stands"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 1,
          "logprob": null,
          "text": "<s>"
        },
        {
          "id": 4911,
          "logprob": -6.9804688,
          "text": "User"
        },
        {
          "id": 29901,
          "logprob": -0.006122589,
          "text": ":"
        },
        {
          "id": 32000,
          "logprob": -0.8417969,
          "text": "<fake_token_around_image>"
        },
        {
          "id": 32001,
          "logprob": -9.918213e-05,
          "text": "<image>"
        },
        {
          "id": 32000,
          "logprob": -2.3841858e-07,
          "text": "<fake_token_around_image>"
        },
        {
          "id": 1815,
          "logprob": -4.1679688,
          "text": "Can"
        },
        {
          "id": 366,
          "logprob": -0.014091492,
          "text": "you"
        },
        {
          "id": 2649,
          "logprob": -4.4726562,
          "text": "tell"
        },
        {
          "id": 592,
          "logprob": -0.2998047,
          "text": "me"
        },
        {
          "id": 263,
          "logprob": -4.15625,
          "text": "a"
        },
        {
          "id": 1407,
          "logprob": -9.3828125,
          "text": "very"
        },
        {
          "id": 3273,
          "logprob": -1.9716797,
          "text": "short"
        },
        {
          "id": 5828,
          "logprob": -0.27734375,
          "text": "story"
        },
        {
          "id": 2729,
          "logprob": -3.5605469,
          "text": "based"
        },
        {
          "id": 373,
          "logprob": -0.00064468384,
          "text": "on"
        },
        {
          "id": 278,
          "logprob": -0.14160156,
          "text": "the"
        },
        {
          "id": 1967,
          "logprob": -0.06915283,
          "text": "image"
        },
        {
          "id": 29973,
          "logprob": -0.16381836,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 32002,
          "logprob": -0.0026664734,
          "special": true,
          "text": "<end_of_utterance>"
        },
        {
          "id": 29871,
          "logprob": -8.59499e-05,
          "special": false,
          "text": " "
        },
        {
          "id": 13,
          "logprob": -1.8119812e-05,
          "special": false,
          "text": "\n"
        },
        {
          "id": 7900,
          "logprob": -3.0994415e-06,
          "special": false,
          "text": "Ass"
        },
        {
          "id": 22137,
          "logprob": 0.0,
          "special": false,
          "text": "istant"
        },
        {
          "id": 29901,
          "logprob": -3.0994415e-06,
          "special": false,
          "text": ":"
        },
        {
          "id": 319,
          "logprob": -0.9301758,
          "special": false,
          "text": " A"
        },
        {
          "id": 696,
          "logprob": -1.1279297,
          "special": false,
          "text": " ro"
        },
        {
          "id": 15664,
          "logprob": -0.0002939701,
          "special": false,
          "text": "oster"
        },
        {
          "id": 15028,
          "logprob": -1.1865234,
          "special": false,
          "text": " stands"
        }
      ]
    },
    "generated_text": " \nAssistant: A rooster stands"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 1,
          "logprob": null,
          "text": "<s>"
        },
        {
          "id": 4911,
          "logprob": -6.9804688,
          "text": "User"
        },
        {
          "id": 29901,
          "logprob": -0.006122589,
          "text": ":"
        },
        {
          "id": 32000,
          "logprob": -0.8417969,
          "text": "<fake_token_around_image>"
        },
        {
          "id": 32001,
          "logprob": -9.942055e-05,
          "text": "<image>"
        },
        {
          "id": 32000,
          "logprob": -2.3841858e-07,
          "text": "<fake_token_around_image>"
        },
        {
          "id": 1815,
          "logprob": -4.1679688,
          "text": "Can"
        },
        {
          "id": 366,
          "logprob": -0.014091492,
          "text": "you"
        },
        {
          "id": 2649,
          "logprob": -4.4726562,
          "text": "tell"
        },
        {
          "id": 592,
          "logprob": -0.2998047,
          "text": "me"
        },
        {
          "id": 263,
          "logprob": -4.15625,
          "text": "a"
        },
        {
          "id": 1407,
          "logprob": -9.3828125,
          "text": "very"
        },
        {
          "id": 3273,
          "logprob": -1.9716797,
          "text": "short"
        },
        {
          "id": 5828,
          "logprob": -0.27734375,
          "text": "story"
        },
        {
          "id": 2729,
          "logprob": -3.5605469,
          "text": "based"
        },
        {
          "id": 373,
          "logprob": -0.0006451607,
          "text": "on"
        },
        {
          "id": 278,
          "logprob": -0.14160156,
          "text": "the"
        },
        {
          "id": 1967,
          "logprob": -0.06915283,
          "text": "image"
        },
        {
          "id": 29973,
          "logprob": -0.16381836,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 32002,
          "logprob": -0.0026664734,
          "special": true,
          "text": "<end_of_utterance>"
        },
        {
          "id": 29871,
          "logprob": -8.571148e-05,
          "special": false,
          "text": " "
        },
        {
          "id": 13,
          "logprob": -1.8119812e-05,
          "special": false,
          "text": "\n"
        },
        {
          "id": 7900,
          "logprob": -3.0994415e-06,
          "special": false,
          "text": "Ass"
        },
        {
          "id": 22137,
          "logprob": 0.0,
          "special": false,
          "text": "istant"
        },
        {
          "id": 29901,
          "logprob": -3.0994415e-06,
          "special": false,
          "text": ":"
        },
        {
          "id": 319,
          "logprob": -0.9301758,
          "special": false,
          "text": " A"
        },
        {
          "id": 696,
          "logprob": -1.1279297,
          "special": false,
          "text": " ro"
        },
        {
          "id": 15664,
          "logprob": -0.0002939701,
          "special": false,
          "text": "oster"
        },
        {
          "id": 15028,
          "logprob": -1.1865234,
          "special": false,
          "text": " stands"
        }
      ]
    },
    "generated_text": " \nAssistant: A rooster stands"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 13,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 450,
        "logprob": -0.2602539,
        "special": false,
        "text": " The"
      },
      {
        "id": 21282,
        "logprob": -0.018463135,
        "special": false,
        "text": " cow"
      },
      {
        "id": 322,
        "logprob": -0.1829834,
        "special": false,
        "text": " and"
      },
      {
        "id": 521,
        "logprob": -0.62109375,
        "special": false,
        "text": " ch"
      },
      {
        "id": 21475,
        "logprob": -0.0037403107,
        "special": false,
        "text": "icken"
      },
      {
        "id": 526,
        "logprob": -0.018920898,
        "special": false,
        "text": " are"
      },
      {
        "id": 13407,
        "logprob": -1.0732422,
        "special": false,
        "text": " standing"
      },
      {
        "id": 373,
        "logprob": -0.5292969,
        "special": false,
        "text": " on"
      },
      {
        "id": 263,
        "logprob": -0.47070312,
        "special": false,
        "text": " a"
      },
      {
        "id": 25695,
        "logprob": -0.25708008,
        "special": false,
        "text": " beach"
      },
      {
        "id": 29889,
        "logprob": -0.17578125,
        "special": false,
        "text": "."
      },
      {
        "id": 32002,
        "logprob": -0.0023422241,
        "special": true,
        "text": "<end_of_utterance>"
      },
      {
        "id": 2,
        "logprob": -0.00030851364,
        "special": true,
        "text": "</s>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " The cow and chicken are standing on a beach."
}


================================================
FILE: integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 288,
        "logprob": -0.2854004,
        "special": false,
        "text": "ing"
      },
      {
        "id": 264,
        "logprob": -0.38061523,
        "special": false,
        "text": " a"
      },
      {
        "id": 633,
        "logprob": -0.09301758,
        "special": false,
        "text": " new"
      },
      {
        "id": 4480,
        "logprob": -0.26782227,
        "special": false,
        "text": " feature"
      },
      {
        "id": 297,
        "logprob": -0.8510742,
        "special": false,
        "text": " in"
      },
      {
        "id": 272,
        "logprob": -0.13464355,
        "special": false,
        "text": " the"
      },
      {
        "id": 2039,
        "logprob": 0.0,
        "special": false,
        "text": " game"
      },
      {
        "id": 28723,
        "logprob": -0.89990234,
        "special": false,
        "text": "."
      },
      {
        "id": 13,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.10632324,
        "special": false,
        "text": "\n"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test requesting a new feature in the game.\n\n"
}


================================================
FILE: integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 330,
          "logprob": -0.09289551,
          "special": false,
          "text": " A"
        },
        {
          "id": 13088,
          "logprob": -0.6743164,
          "special": false,
          "text": " chicken"
        },
        {
          "id": 349,
          "logprob": -0.31396484,
          "special": false,
          "text": " is"
        },
        {
          "id": 6398,
          "logprob": -0.051727295,
          "special": false,
          "text": " sitting"
        },
        {
          "id": 356,
          "logprob": -0.34448242,
          "special": false,
          "text": " on"
        },
        {
          "id": 264,
          "logprob": -0.1194458,
          "special": false,
          "text": " a"
        },
        {
          "id": 17972,
          "logprob": -0.03237915,
          "special": false,
          "text": " pile"
        },
        {
          "id": 302,
          "logprob": -0.00018751621,
          "special": false,
          "text": " of"
        },
        {
          "id": 2445,
          "logprob": -0.07043457,
          "special": false,
          "text": " money"
        },
        {
          "id": 28723,
          "logprob": -0.00422287,
          "special": false,
          "text": "."
        }
      ],
      "top_tokens": null
    },
    "generated_text": " A chicken is sitting on a pile of money."
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 330,
          "logprob": -0.09448242,
          "special": false,
          "text": " A"
        },
        {
          "id": 13088,
          "logprob": -0.6743164,
          "special": false,
          "text": " chicken"
        },
        {
          "id": 349,
          "logprob": -0.31201172,
          "special": false,
          "text": " is"
        },
        {
          "id": 6398,
          "logprob": -0.051635742,
          "special": false,
          "text": " sitting"
        },
        {
          "id": 356,
          "logprob": -0.34033203,
          "special": false,
          "text": " on"
        },
        {
          "id": 264,
          "logprob": -0.1194458,
          "special": false,
          "text": " a"
        },
        {
          "id": 17972,
          "logprob": -0.032562256,
          "special": false,
          "text": " pile"
        },
        {
          "id": 302,
          "logprob": -0.00018763542,
          "special": false,
          "text": " of"
        },
        {
          "id": 2445,
          "logprob": -0.07122803,
          "special": false,
          "text": " money"
        },
        {
          "id": 28723,
          "logprob": -0.0041007996,
          "special": false,
          "text": "."
        }
      ],
      "top_tokens": null
    },
    "generated_text": " A chicken is sitting on a pile of money."
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 330,
          "logprob": -0.09448242,
          "special": false,
          "text": " A"
        },
        {
          "id": 13088,
          "logprob": -0.6743164,
          "special": false,
          "text": " chicken"
        },
        {
          "id": 349,
          "logprob": -0.31201172,
          "special": false,
          "text": " is"
        },
        {
          "id": 6398,
          "logprob": -0.051635742,
          "special": false,
          "text": " sitting"
        },
        {
          "id": 356,
          "logprob": -0.34033203,
          "special": false,
          "text": " on"
        },
        {
          "id": 264,
          "logprob": -0.1194458,
          "special": false,
          "text": " a"
        },
        {
          "id": 17972,
          "logprob": -0.032562256,
          "special": false,
          "text": " pile"
        },
        {
          "id": 302,
          "logprob": -0.00018787384,
          "special": false,
          "text": " of"
        },
        {
          "id": 2445,
          "logprob": -0.07122803,
          "special": false,
          "text": " money"
        },
        {
          "id": 28723,
          "logprob": -0.0041007996,
          "special": false,
          "text": "."
        }
      ],
      "top_tokens": null
    },
    "generated_text": " A chicken is sitting on a pile of money."
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 330,
          "logprob": -0.09448242,
          "special": false,
          "text": " A"
        },
        {
          "id": 13088,
          "logprob": -0.6743164,
          "special": false,
          "text": " chicken"
        },
        {
          "id": 349,
          "logprob": -0.31201172,
          "special": false,
          "text": " is"
        },
        {
          "id": 6398,
          "logprob": -0.051635742,
          "special": false,
          "text": " sitting"
        },
        {
          "id": 356,
          "logprob": -0.34033203,
          "special": false,
          "text": " on"
        },
        {
          "id": 264,
          "logprob": -0.1194458,
          "special": false,
          "text": " a"
        },
        {
          "id": 17972,
          "logprob": -0.032562256,
          "special": false,
          "text": " pile"
        },
        {
          "id": 302,
          "logprob": -0.00018763542,
          "special": false,
          "text": " of"
        },
        {
          "id": 2445,
          "logprob": -0.07122803,
          "special": false,
          "text": " money"
        },
        {
          "id": 28723,
          "logprob": -0.0041007996,
          "special": false,
          "text": "."
        }
      ],
      "top_tokens": null
    },
    "generated_text": " A chicken is sitting on a pile of money."
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_simple.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 330,
        "logprob": -0.08660889,
        "special": false,
        "text": " A"
      },
      {
        "id": 13088,
        "logprob": -0.7089844,
        "special": false,
        "text": " chicken"
      },
      {
        "id": 349,
        "logprob": -0.32885742,
        "special": false,
        "text": " is"
      },
      {
        "id": 6398,
        "logprob": -0.05126953,
        "special": false,
        "text": " sitting"
      },
      {
        "id": 356,
        "logprob": -0.35229492,
        "special": false,
        "text": " on"
      },
      {
        "id": 264,
        "logprob": -0.12561035,
        "special": false,
        "text": " a"
      },
      {
        "id": 17972,
        "logprob": -0.038085938,
        "special": false,
        "text": " pile"
      },
      {
        "id": 302,
        "logprob": -0.00018656254,
        "special": false,
        "text": " of"
      },
      {
        "id": 2445,
        "logprob": -0.07293701,
        "special": false,
        "text": " money"
      },
      {
        "id": 28723,
        "logprob": -0.004852295,
        "special": false,
        "text": "."
      }
    ],
    "top_tokens": null
  },
  "generated_text": " A chicken is sitting on a pile of money."
}


================================================
FILE: integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 19,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 415,
        "logprob": -0.03665161,
        "special": false,
        "text": " The"
      },
      {
        "id": 12072,
        "logprob": -0.13549805,
        "special": false,
        "text": " cow"
      },
      {
        "id": 349,
        "logprob": -0.05819702,
        "special": false,
        "text": " is"
      },
      {
        "id": 6328,
        "logprob": -0.6826172,
        "special": false,
        "text": " standing"
      },
      {
        "id": 356,
        "logprob": -0.1607666,
        "special": false,
        "text": " on"
      },
      {
        "id": 272,
        "logprob": -0.5073242,
        "special": false,
        "text": " the"
      },
      {
        "id": 10305,
        "logprob": -0.016418457,
        "special": false,
        "text": " beach"
      },
      {
        "id": 304,
        "logprob": -1.3916016,
        "special": false,
        "text": " and"
      },
      {
        "id": 272,
        "logprob": -0.020217896,
        "special": false,
        "text": " the"
      },
      {
        "id": 13088,
        "logprob": -0.0028133392,
        "special": false,
        "text": " chicken"
      },
      {
        "id": 349,
        "logprob": -0.003145218,
        "special": false,
        "text": " is"
      },
      {
        "id": 6398,
        "logprob": -0.37060547,
        "special": false,
        "text": " sitting"
      },
      {
        "id": 356,
        "logprob": -0.034851074,
        "special": false,
        "text": " on"
      },
      {
        "id": 264,
        "logprob": -0.2878418,
        "special": false,
        "text": " a"
      },
      {
        "id": 17972,
        "logprob": -0.046051025,
        "special": false,
        "text": " pile"
      },
      {
        "id": 302,
        "logprob": -0.00028848648,
        "special": false,
        "text": " of"
      },
      {
        "id": 2445,
        "logprob": -0.025772095,
        "special": false,
        "text": " money"
      },
      {
        "id": 28723,
        "logprob": -0.018127441,
        "special": false,
        "text": "."
      },
      {
        "id": 32002,
        "logprob": -0.00019824505,
        "special": true,
        "text": "<end_of_utterance>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " The cow is standing on the beach and the chicken is sitting on a pile of money."
}


================================================
FILE: integration-tests/models/__snapshots__/test_idefics3/test_flash_idefics3_next_simple_url.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 9,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 2684,
        "logprob": -0.24902344,
        "special": false,
        "text": " There"
      },
      {
        "id": 374,
        "logprob": -0.0703125,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.23535156,
        "special": false,
        "text": " a"
      },
      {
        "id": 35372,
        "logprob": -0.125,
        "special": false,
        "text": " statue"
      },
      {
        "id": 304,
        "logprob": -0.30273438,
        "special": false,
        "text": " in"
      },
      {
        "id": 279,
        "logprob": -0.20507812,
        "special": false,
        "text": " the"
      },
      {
        "id": 2217,
        "logprob": -0.076171875,
        "special": false,
        "text": " image"
      },
      {
        "id": 13,
        "logprob": -0.053710938,
        "special": false,
        "text": "."
      },
      {
        "id": 128258,
        "logprob": -0.011352539,
        "special": true,
        "text": "<end_of_utterance>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " There is a statue in the image."
}


================================================
FILE: integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_basic.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "{\"firstName\":\"David\",\"lastName\":\"(Not provided)\",\"hobby\":\", nature\",\"numCats\":2}",
        "role": "assistant"
      }
    }
  ],
  "created": 1746053368,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.3.6-dev0-native",
  "usage": {
    "completion_tokens": 35,
    "prompt_tokens": 32,
    "total_tokens": 67
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_complex.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "{\"name\":\"John Smith\",\"age\":30,\"address\":{\"street\":\"Maple Street\",\"city\":\"Boston\"},\"hobbies\":[\"botany\",\"astronomy\",\"solving mathematical puzzles\"]}",
        "role": "assistant"
      }
    }
  ],
  "created": 1746053373,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.3.6-dev0-native",
  "usage": {
    "completion_tokens": 44,
    "prompt_tokens": 37,
    "total_tokens": 81
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_json_schema_constrain/test_json_schema_stream.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": "{",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\"",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "f",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "irs",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "t",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "Name",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\":",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\"",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "David",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\",",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\"",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "l",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "ast",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "Name",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\":",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\"",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "Unknown",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975615,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\",",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\"",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "h",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "obb",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "y",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\":",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\",",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " \\\"",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "riding",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " bicycles",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\\\",",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " \\\"",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "having",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " cats",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\\\"\",",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\"",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "num",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "Cats",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "\":",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "2",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "}",
          "role": "assistant"
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "",
          "role": "assistant"
        },
        "finish_reason": "stop",
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741975616,
    "id": "",
    "model": "google/gemma-3-4b-it",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.2.1-dev0-native",
    "usage": null
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "stop_sequence",
    "generated_tokens": 6,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 13,
        "logprob": -1.0654297,
        "special": false,
        "text": "\n"
      },
      {
        "id": 1014,
        "logprob": -2.7460938,
        "special": false,
        "text": "The"
      },
      {
        "id": 6032,
        "logprob": -1.359375,
        "special": false,
        "text": " purpose"
      },
      {
        "id": 302,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 456,
        "logprob": 0.0,
        "special": false,
        "text": " this"
      },
      {
        "id": 1369,
        "logprob": -0.40063477,
        "special": false,
        "text": " test"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request\nThe purpose of this test"
}


================================================
FILE: integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.007621765,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.20812988,
          "special": false,
          "text": "\n"
        },
        {
          "id": 16114,
          "logprob": -1.2587891,
          "special": false,
          "text": "Once"
        },
        {
          "id": 3714,
          "logprob": -0.20825195,
          "special": false,
          "text": " upon"
        },
        {
          "id": 264,
          "logprob": -0.0017709732,
          "special": false,
          "text": " a"
        },
        {
          "id": 727,
          "logprob": -0.011932373,
          "special": false,
          "text": " time"
        },
        {
          "id": 28725,
          "logprob": -0.17297363,
          "special": false,
          "text": ","
        },
        {
          "id": 736,
          "logprob": -0.9057617,
          "special": false,
          "text": " there"
        },
        {
          "id": 403,
          "logprob": -0.05758667,
          "special": false,
          "text": " was"
        },
        {
          "id": 264,
          "logprob": -0.00970459,
          "special": false,
          "text": " a"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nOnce upon a time, there was a"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.007621765,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.20275879,
          "special": false,
          "text": "\n"
        },
        {
          "id": 16114,
          "logprob": -1.2578125,
          "special": false,
          "text": "Once"
        },
        {
          "id": 3714,
          "logprob": -0.2084961,
          "special": false,
          "text": " upon"
        },
        {
          "id": 264,
          "logprob": -0.0017738342,
          "special": false,
          "text": " a"
        },
        {
          "id": 727,
          "logprob": -0.011932373,
          "special": false,
          "text": " time"
        },
        {
          "id": 28725,
          "logprob": -0.17297363,
          "special": false,
          "text": ","
        },
        {
          "id": 736,
          "logprob": -0.9057617,
          "special": false,
          "text": " there"
        },
        {
          "id": 403,
          "logprob": -0.05758667,
          "special": false,
          "text": " was"
        },
        {
          "id": 264,
          "logprob": -0.00970459,
          "special": false,
          "text": " a"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nOnce upon a time, there was a"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.007621765,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.20275879,
          "special": false,
          "text": "\n"
        },
        {
          "id": 16114,
          "logprob": -1.2578125,
          "special": false,
          "text": "Once"
        },
        {
          "id": 3714,
          "logprob": -0.2084961,
          "special": false,
          "text": " upon"
        },
        {
          "id": 264,
          "logprob": -0.0017738342,
          "special": false,
          "text": " a"
        },
        {
          "id": 727,
          "logprob": -0.011932373,
          "special": false,
          "text": " time"
        },
        {
          "id": 28725,
          "logprob": -0.17297363,
          "special": false,
          "text": ","
        },
        {
          "id": 736,
          "logprob": -0.9057617,
          "special": false,
          "text": " there"
        },
        {
          "id": 403,
          "logprob": -0.05758667,
          "special": false,
          "text": " was"
        },
        {
          "id": 264,
          "logprob": -0.00970459,
          "special": false,
          "text": " a"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nOnce upon a time, there was a"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -0.007621765,
          "special": false,
          "text": "\n"
        },
        {
          "id": 13,
          "logprob": -0.20812988,
          "special": false,
          "text": "\n"
        },
        {
          "id": 16114,
          "logprob": -1.2587891,
          "special": false,
          "text": "Once"
        },
        {
          "id": 3714,
          "logprob": -0.20825195,
          "special": false,
          "text": " upon"
        },
        {
          "id": 264,
          "logprob": -0.0017709732,
          "special": false,
          "text": " a"
        },
        {
          "id": 727,
          "logprob": -0.011932373,
          "special": false,
          "text": " time"
        },
        {
          "id": 28725,
          "logprob": -0.17297363,
          "special": false,
          "text": ","
        },
        {
          "id": 736,
          "logprob": -0.9057617,
          "special": false,
          "text": " there"
        },
        {
          "id": 403,
          "logprob": -0.05758667,
          "special": false,
          "text": " was"
        },
        {
          "id": 264,
          "logprob": -0.00970459,
          "special": false,
          "text": " a"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nOnce upon a time, there was a"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_simple.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -0.00756073,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.20117188,
        "special": false,
        "text": "\n"
      },
      {
        "id": 16114,
        "logprob": -1.2597656,
        "special": false,
        "text": "Once"
      },
      {
        "id": 3714,
        "logprob": -0.20825195,
        "special": false,
        "text": " upon"
      },
      {
        "id": 264,
        "logprob": -0.00178051,
        "special": false,
        "text": " a"
      },
      {
        "id": 727,
        "logprob": -0.011955261,
        "special": false,
        "text": " time"
      },
      {
        "id": 28725,
        "logprob": -0.17541504,
        "special": false,
        "text": ","
      },
      {
        "id": 736,
        "logprob": -0.91308594,
        "special": false,
        "text": " there"
      },
      {
        "id": 403,
        "logprob": -0.058410645,
        "special": false,
        "text": " was"
      },
      {
        "id": 264,
        "logprob": -0.009689331,
        "special": false,
        "text": " a"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nOnce upon a time, there was a"
}


================================================
FILE: integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_customer_support_adapter.json
================================================
{
  "details": {
    "finish_reason": "length",
    "generated_tokens": 40,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -0.27416992,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.17016602,
        "special": false,
        "text": "\n"
      },
      {
        "id": 28737,
        "logprob": -2.7109375,
        "special": false,
        "text": "I"
      },
      {
        "id": 28809,
        "logprob": -1.5,
        "special": false,
        "text": "’"
      },
      {
        "id": 28719,
        "logprob": -0.34204102,
        "special": false,
        "text": "m"
      },
      {
        "id": 459,
        "logprob": -1.6914062,
        "special": false,
        "text": " not"
      },
      {
        "id": 1864,
        "logprob": -0.69140625,
        "special": false,
        "text": " sure"
      },
      {
        "id": 513,
        "logprob": -1.6171875,
        "special": false,
        "text": " if"
      },
      {
        "id": 315,
        "logprob": -1.3837891,
        "special": false,
        "text": " I"
      },
      {
        "id": 541,
        "logprob": -1.2226562,
        "special": false,
        "text": " can"
      },
      {
        "id": 1567,
        "logprob": -1.8652344,
        "special": false,
        "text": " come"
      },
      {
        "id": 582,
        "logprob": -0.0070228577,
        "special": false,
        "text": " up"
      },
      {
        "id": 395,
        "logprob": -0.0054092407,
        "special": false,
        "text": " with"
      },
      {
        "id": 28705,
        "logprob": -0.62597656,
        "special": false,
        "text": " "
      },
      {
        "id": 28770,
        "logprob": -0.0035572052,
        "special": false,
        "text": "3"
      },
      {
        "id": 4842,
        "logprob": -0.93603516,
        "special": false,
        "text": " unique"
      },
      {
        "id": 3085,
        "logprob": -0.028411865,
        "special": false,
        "text": " words"
      },
      {
        "id": 369,
        "logprob": -1.0400391,
        "special": false,
        "text": " that"
      },
      {
        "id": 6685,
        "logprob": -0.09710693,
        "special": false,
        "text": " describe"
      },
      {
        "id": 528,
        "logprob": -0.066467285,
        "special": false,
        "text": " me"
      },
      {
        "id": 28725,
        "logprob": -1.0722656,
        "special": false,
        "text": ","
      },
      {
        "id": 562,
        "logprob": -0.33422852,
        "special": false,
        "text": " but"
      },
      {
        "id": 315,
        "logprob": -0.5136719,
        "special": false,
        "text": " I"
      },
      {
        "id": 28809,
        "logprob": -0.8989258,
        "special": false,
        "text": "’"
      },
      {
        "id": 584,
        "logprob": -0.2076416,
        "special": false,
        "text": "ll"
      },
      {
        "id": 1464,
        "logprob": -0.8808594,
        "special": false,
        "text": " try"
      },
      {
        "id": 28723,
        "logprob": -0.88427734,
        "special": false,
        "text": "."
      },
      {
        "id": 13,
        "logprob": -0.91064453,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.08105469,
        "special": false,
        "text": "\n"
      },
      {
        "id": 28740,
        "logprob": -1.8486328,
        "special": false,
        "text": "1"
      },
      {
        "id": 28723,
        "logprob": -0.111572266,
        "special": false,
        "text": "."
      },
      {
        "id": 23626,
        "logprob": -3.15625,
        "special": false,
        "text": " Creative"
      },
      {
        "id": 13,
        "logprob": -0.9194336,
        "special": false,
        "text": "\n"
      },
      {
        "id": 28750,
        "logprob": -0.24841309,
        "special": false,
        "text": "2"
      },
      {
        "id": 28723,
        "logprob": -9.393692e-05,
        "special": false,
        "text": "."
      },
      {
        "id": 6785,
        "logprob": -3.1386719,
        "special": false,
        "text": " Fun"
      },
      {
        "id": 1780,
        "logprob": -0.53564453,
        "special": false,
        "text": "ny"
      },
      {
        "id": 13,
        "logprob": -0.09033203,
        "special": false,
        "text": "\n"
      },
      {
        "id": 28770,
        "logprob": -0.00466156,
        "special": false,
        "text": "3"
      },
      {
        "id": 28723,
        "logprob": -0.00016450882,
        "special": false,
        "text": "."
      }
    ]
  },
  "generated_text": "\n\nI’m not sure if I can come up with 3 unique words that describe me, but I’ll try.\n\n1. Creative\n2. Funny\n3."
}


================================================
FILE: integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_with_dbpedia_adapter.json
================================================
{
  "details": {
    "finish_reason": "eos_token",
    "generated_tokens": 7,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 1,
        "logprob": -0.49658203,
        "special": true,
        "text": "<s>"
      },
      {
        "id": 28705,
        "logprob": -0.0016384125,
        "special": false,
        "text": " "
      },
      {
        "id": 1,
        "logprob": -1.4931641,
        "special": true,
        "text": "<s>"
      },
      {
        "id": 28705,
        "logprob": -0.00075769424,
        "special": false,
        "text": " "
      },
      {
        "id": 28740,
        "logprob": -0.25024414,
        "special": false,
        "text": "1"
      },
      {
        "id": 28740,
        "logprob": -0.2631836,
        "special": false,
        "text": "1"
      },
      {
        "id": 2,
        "logprob": -0.0003285408,
        "special": true,
        "text": "</s>"
      }
    ]
  },
  "generated_text": "  11"
}


================================================
FILE: integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_adapter.json
================================================
{
  "details": {
    "finish_reason": "length",
    "generated_tokens": 40,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -1.0488281,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -1.0800781,
        "special": false,
        "text": "\n"
      },
      {
        "id": 27332,
        "logprob": -2.1152344,
        "special": false,
        "text": "###"
      },
      {
        "id": 28705,
        "logprob": -1.6748047,
        "special": false,
        "text": " "
      },
      {
        "id": 28740,
        "logprob": -0.097229004,
        "special": false,
        "text": "1"
      },
      {
        "id": 28723,
        "logprob": -0.16467285,
        "special": false,
        "text": "."
      },
      {
        "id": 7615,
        "logprob": -2.2246094,
        "special": false,
        "text": " News"
      },
      {
        "id": 13,
        "logprob": -1.0488281,
        "special": false,
        "text": "\n"
      },
      {
        "id": 27332,
        "logprob": -0.69189453,
        "special": false,
        "text": "###"
      },
      {
        "id": 28705,
        "logprob": -0.013343811,
        "special": false,
        "text": " "
      },
      {
        "id": 28750,
        "logprob": -0.011230469,
        "special": false,
        "text": "2"
      },
      {
        "id": 28723,
        "logprob": -0.00096845627,
        "special": false,
        "text": "."
      },
      {
        "id": 21095,
        "logprob": -2.5605469,
        "special": false,
        "text": " Blog"
      },
      {
        "id": 13,
        "logprob": -0.19458008,
        "special": false,
        "text": "\n"
      },
      {
        "id": 27332,
        "logprob": -0.031280518,
        "special": false,
        "text": "###"
      },
      {
        "id": 28705,
        "logprob": -0.0030708313,
        "special": false,
        "text": " "
      },
      {
        "id": 28770,
        "logprob": -0.0029277802,
        "special": false,
        "text": "3"
      },
      {
        "id": 28723,
        "logprob": -0.0012350082,
        "special": false,
        "text": "."
      },
      {
        "id": 20108,
        "logprob": -2.1582031,
        "special": false,
        "text": " Article"
      },
      {
        "id": 13,
        "logprob": -0.05810547,
        "special": false,
        "text": "\n"
      },
      {
        "id": 27332,
        "logprob": -0.35083008,
        "special": false,
        "text": "###"
      },
      {
        "id": 28705,
        "logprob": -0.034332275,
        "special": false,
        "text": " "
      },
      {
        "id": 28781,
        "logprob": -0.009666443,
        "special": false,
        "text": "4"
      },
      {
        "id": 28723,
        "logprob": -0.0013113022,
        "special": false,
        "text": "."
      },
      {
        "id": 8349,
        "logprob": -2.6191406,
        "special": false,
        "text": " Review"
      },
      {
        "id": 13,
        "logprob": -0.04031372,
        "special": false,
        "text": "\n"
      },
      {
        "id": 27332,
        "logprob": -0.45239258,
        "special": false,
        "text": "###"
      },
      {
        "id": 28705,
        "logprob": -0.045410156,
        "special": false,
        "text": " "
      },
      {
        "id": 28782,
        "logprob": -0.0041236877,
        "special": false,
        "text": "5"
      },
      {
        "id": 28723,
        "logprob": -0.0010223389,
        "special": false,
        "text": "."
      },
      {
        "id": 5299,
        "logprob": -2.8066406,
        "special": false,
        "text": " Other"
      },
      {
        "id": 13,
        "logprob": -0.12054443,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.44580078,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -1.4921875,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -1.3574219,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -1.0039062,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.5859375,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.43481445,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.2783203,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.20410156,
        "special": false,
        "text": "\n"
      }
    ]
  },
  "generated_text": "\n\n### 1. News\n### 2. Blog\n### 3. Article\n### 4. Review\n### 5. Other\n\n\n\n\n\n\n\n\n"
}


================================================
FILE: integration-tests/models/__snapshots__/test_lora_mistral/test_lora_mistral_without_customer_support_adapter.json
================================================
{
  "details": {
    "finish_reason": "length",
    "generated_tokens": 40,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -0.31347656,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.27441406,
        "special": false,
        "text": "\n"
      },
      {
        "id": 28737,
        "logprob": -2.2285156,
        "special": false,
        "text": "I"
      },
      {
        "id": 28809,
        "logprob": -1.4677734,
        "special": false,
        "text": "’"
      },
      {
        "id": 28719,
        "logprob": -0.31762695,
        "special": false,
        "text": "m"
      },
      {
        "id": 264,
        "logprob": -1.6865234,
        "special": false,
        "text": " a"
      },
      {
        "id": 1215,
        "logprob": -3.2695312,
        "special": false,
        "text": " very"
      },
      {
        "id": 20640,
        "logprob": -3.1230469,
        "special": false,
        "text": " passionate"
      },
      {
        "id": 1338,
        "logprob": -0.48339844,
        "special": false,
        "text": " person"
      },
      {
        "id": 28723,
        "logprob": -0.9970703,
        "special": false,
        "text": "."
      },
      {
        "id": 315,
        "logprob": -0.5498047,
        "special": false,
        "text": " I"
      },
      {
        "id": 28809,
        "logprob": -1.1923828,
        "special": false,
        "text": "’"
      },
      {
        "id": 28719,
        "logprob": -0.080444336,
        "special": false,
        "text": "m"
      },
      {
        "id": 1215,
        "logprob": -1.8271484,
        "special": false,
        "text": " very"
      },
      {
        "id": 12215,
        "logprob": -2.8847656,
        "special": false,
        "text": " driven"
      },
      {
        "id": 28723,
        "logprob": -1.0927734,
        "special": false,
        "text": "."
      },
      {
        "id": 315,
        "logprob": -0.4584961,
        "special": false,
        "text": " I"
      },
      {
        "id": 28809,
        "logprob": -0.5019531,
        "special": false,
        "text": "’"
      },
      {
        "id": 28719,
        "logprob": -0.030715942,
        "special": false,
        "text": "m"
      },
      {
        "id": 1215,
        "logprob": -0.96972656,
        "special": false,
        "text": " very"
      },
      {
        "id": 7798,
        "logprob": -2.8847656,
        "special": false,
        "text": " determined"
      },
      {
        "id": 28723,
        "logprob": -0.27319336,
        "special": false,
        "text": "."
      },
      {
        "id": 13,
        "logprob": -0.56396484,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.011016846,
        "special": false,
        "text": "\n"
      },
      {
        "id": 3195,
        "logprob": -0.7163086,
        "special": false,
        "text": "What"
      },
      {
        "id": 349,
        "logprob": -1.1611328,
        "special": false,
        "text": " is"
      },
      {
        "id": 574,
        "logprob": -0.515625,
        "special": false,
        "text": " your"
      },
      {
        "id": 6656,
        "logprob": -1.0253906,
        "special": false,
        "text": " favorite"
      },
      {
        "id": 1970,
        "logprob": -2.1738281,
        "special": false,
        "text": " thing"
      },
      {
        "id": 684,
        "logprob": -0.48364258,
        "special": false,
        "text": " about"
      },
      {
        "id": 1250,
        "logprob": -1.8876953,
        "special": false,
        "text": " being"
      },
      {
        "id": 264,
        "logprob": -0.41967773,
        "special": false,
        "text": " a"
      },
      {
        "id": 8626,
        "logprob": -2.9160156,
        "special": false,
        "text": " teacher"
      },
      {
        "id": 28804,
        "logprob": -0.11920166,
        "special": false,
        "text": "?"
      },
      {
        "id": 13,
        "logprob": -0.023727417,
        "special": false,
        "text": "\n"
      },
      {
        "id": 13,
        "logprob": -0.010848999,
        "special": false,
        "text": "\n"
      },
      {
        "id": 28737,
        "logprob": -1.0566406,
        "special": false,
        "text": "I"
      },
      {
        "id": 2016,
        "logprob": -0.7163086,
        "special": false,
        "text": " love"
      },
      {
        "id": 272,
        "logprob": -1.9169922,
        "special": false,
        "text": " the"
      },
      {
        "id": 1639,
        "logprob": -2.03125,
        "special": false,
        "text": " fact"
      }
    ]
  },
  "generated_text": "\n\nI’m a very passionate person. I’m very driven. I’m very determined.\n\nWhat is your favorite thing about being a teacher?\n\nI love the fact"
}


================================================
FILE: integration-tests/models/__snapshots__/test_mamba/test_mamba.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 187,
        "logprob": -0.37890625,
        "special": false,
        "text": "\n"
      },
      {
        "id": 187,
        "logprob": -0.35742188,
        "special": false,
        "text": "\n"
      },
      {
        "id": 30763,
        "logprob": -1.1015625,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 4715,
        "logprob": -0.5234375,
        "special": false,
        "text": " learning"
      },
      {
        "id": 310,
        "logprob": -0.55078125,
        "special": false,
        "text": " is"
      },
      {
        "id": 247,
        "logprob": -0.6640625,
        "special": false,
        "text": " a"
      },
      {
        "id": 747,
        "logprob": -2.0625,
        "special": false,
        "text": " new"
      },
      {
        "id": 1511,
        "logprob": -2.375,
        "special": false,
        "text": " type"
      },
      {
        "id": 273,
        "logprob": -0.0029144287,
        "special": false,
        "text": " of"
      },
      {
        "id": 5145,
        "logprob": -1.2734375,
        "special": false,
        "text": " machine"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a new type of machine"
}


================================================
FILE: integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 2502,
        "logprob": null,
        "text": " red"
      },
      {
        "id": 13,
        "logprob": -2.734375,
        "text": ","
      },
      {
        "id": 8862,
        "logprob": -3.6875,
        "text": " yellow"
      },
      {
        "id": 13,
        "logprob": -0.40234375,
        "text": ","
      },
      {
        "id": 209,
        "logprob": -8.25,
        "text": " "
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 187,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 395,
        "logprob": -0.3125,
        "special": false,
        "text": "and"
      },
      {
        "id": 4797,
        "logprob": 0.0,
        "special": false,
        "text": " blue"
      },
      {
        "id": 9830,
        "logprob": -2.25,
        "special": false,
        "text": " colors"
      },
      {
        "id": 15,
        "logprob": 0.0,
        "special": false,
        "text": "."
      },
      {
        "id": 329,
        "logprob": -2.296875,
        "special": false,
        "text": " A"
      },
      {
        "id": 1180,
        "logprob": -2.046875,
        "special": false,
        "text": " number"
      },
      {
        "id": 273,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 253,
        "logprob": -0.86328125,
        "special": false,
        "text": " the"
      },
      {
        "id": 3295,
        "logprob": -0.55078125,
        "special": false,
        "text": " color"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "blue, red, yellow, \nand blue colors. A number of the color"
}


================================================
FILE: integration-tests/models/__snapshots__/test_mamba/test_mamba_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -0.83984375,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -12.8125,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -2.84375,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -1.25,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 187,
          "logprob": -0.37890625,
          "special": false,
          "text": "\n"
        },
        {
          "id": 187,
          "logprob": -0.4296875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.078125,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4715,
          "logprob": -0.515625,
          "special": false,
          "text": " learning"
        },
        {
          "id": 310,
          "logprob": -0.6015625,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.65625,
          "special": false,
          "text": " a"
        },
        {
          "id": 747,
          "logprob": -2.109375,
          "special": false,
          "text": " new"
        },
        {
          "id": 1511,
          "logprob": -2.328125,
          "special": false,
          "text": " type"
        },
        {
          "id": 273,
          "logprob": -0.0032653809,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.28125,
          "special": false,
          "text": " machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new type of machine"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -0.80078125,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -13.25,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -2.828125,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -1.1953125,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 187,
          "logprob": -0.296875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 187,
          "logprob": -0.3359375,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.2578125,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4715,
          "logprob": -0.5546875,
          "special": false,
          "text": " learning"
        },
        {
          "id": 310,
          "logprob": -0.62890625,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.64453125,
          "special": false,
          "text": " a"
        },
        {
          "id": 747,
          "logprob": -2.078125,
          "special": false,
          "text": " new"
        },
        {
          "id": 1511,
          "logprob": -2.28125,
          "special": false,
          "text": " type"
        },
        {
          "id": 273,
          "logprob": -0.0030670166,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.3125,
          "special": false,
          "text": " machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new type of machine"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -0.80078125,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -13.25,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -2.828125,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -1.1953125,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 187,
          "logprob": -0.296875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 187,
          "logprob": -0.3359375,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.2578125,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4715,
          "logprob": -0.5546875,
          "special": false,
          "text": " learning"
        },
        {
          "id": 310,
          "logprob": -0.62890625,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.64453125,
          "special": false,
          "text": " a"
        },
        {
          "id": 747,
          "logprob": -2.078125,
          "special": false,
          "text": " new"
        },
        {
          "id": 1511,
          "logprob": -2.28125,
          "special": false,
          "text": " type"
        },
        {
          "id": 273,
          "logprob": -0.0030670166,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.3125,
          "special": false,
          "text": " machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new type of machine"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -0.80078125,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -13.25,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -2.828125,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -1.1953125,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 187,
          "logprob": -0.296875,
          "special": false,
          "text": "\n"
        },
        {
          "id": 187,
          "logprob": -0.3359375,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.2578125,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4715,
          "logprob": -0.5546875,
          "special": false,
          "text": " learning"
        },
        {
          "id": 310,
          "logprob": -0.62890625,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.64453125,
          "special": false,
          "text": " a"
        },
        {
          "id": 747,
          "logprob": -2.078125,
          "special": false,
          "text": " new"
        },
        {
          "id": 1511,
          "logprob": -2.28125,
          "special": false,
          "text": " type"
        },
        {
          "id": 273,
          "logprob": -0.0030670166,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.3125,
          "special": false,
          "text": " machine"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new type of machine"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_mllama/test_mllama_load.json
================================================
[
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "A chicken sits on a pile of money, looking",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1747230173,
    "id": "",
    "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "3.3.6-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 45,
      "total_tokens": 55
    }
  },
  {
    "choices": [
      {
        "finish_reason": "length",
        "index": 0,
        "logprobs": null,
        "message": {
          "content": "A chicken sits on a pile of money, looking",
          "name": null,
          "role": "assistant",
          "tool_calls": null
        },
        "usage": null
      }
    ],
    "created": 1747230173,
    "id": "",
    "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
    "object": "chat.completion",
    "system_fingerprint": "3.3.6-dev0-native",
    "usage": {
      "completion_tokens": 10,
      "prompt_tokens": 45,
      "total_tokens": 55
    }
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_mllama/test_mllama_simpl.json
================================================
{
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "A chicken sits on a pile of money, looking",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1747230171,
  "id": "",
  "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.3.6-dev0-native",
  "usage": {
    "completion_tokens": 10,
    "prompt_tokens": 45,
    "total_tokens": 55
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_mpt/test_mpt.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 17,
    "prefill": [
      {
        "id": 1276,
        "logprob": null,
        "text": "What"
      },
      {
        "id": 310,
        "logprob": -1.5117188,
        "text": " is"
      },
      {
        "id": 18147,
        "logprob": -8.96875,
        "text": " Deep"
      },
      {
        "id": 20727,
        "logprob": -1.953125,
        "text": " Learning"
      },
      {
        "id": 32,
        "logprob": -0.94189453,
        "text": "?"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 428,
        "logprob": -1.5830078,
        "special": false,
        "text": " -"
      },
      {
        "id": 18147,
        "logprob": -3.3105469,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 20727,
        "logprob": -0.3215332,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 187,
        "logprob": -2.5566406,
        "special": false,
        "text": "\n"
      },
      {
        "id": 30763,
        "logprob": -1.6074219,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 20727,
        "logprob": -0.69628906,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 310,
        "logprob": -0.6923828,
        "special": false,
        "text": " is"
      },
      {
        "id": 247,
        "logprob": -0.5263672,
        "special": false,
        "text": " a"
      },
      {
        "id": 749,
        "logprob": -1.8544922,
        "special": false,
        "text": " sub"
      },
      {
        "id": 3423,
        "logprob": -0.6118164,
        "special": false,
        "text": "field"
      },
      {
        "id": 273,
        "logprob": -0.055877686,
        "special": false,
        "text": " of"
      },
      {
        "id": 5145,
        "logprob": -1.0537109,
        "special": false,
        "text": " machine"
      },
      {
        "id": 4715,
        "logprob": -0.0115737915,
        "special": false,
        "text": " learning"
      },
      {
        "id": 326,
        "logprob": -0.9111328,
        "special": false,
        "text": " that"
      },
      {
        "id": 4648,
        "logprob": -1.4589844,
        "special": false,
        "text": " uses"
      },
      {
        "id": 13345,
        "logprob": -1.4853516,
        "special": false,
        "text": " artificial"
      },
      {
        "id": 11454,
        "logprob": -0.021636963,
        "special": false,
        "text": " neural"
      }
    ]
  },
  "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
}


================================================
FILE: integration-tests/models/__snapshots__/test_mpt/test_mpt_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 17,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -1.5117188,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -8.96875,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -1.953125,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -0.94189453,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 428,
          "logprob": -1.5830078,
          "special": false,
          "text": " -"
        },
        {
          "id": 18147,
          "logprob": -3.3183594,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -0.32617188,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 187,
          "logprob": -2.5742188,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.6015625,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 20727,
          "logprob": -0.69628906,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 310,
          "logprob": -0.67822266,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.5395508,
          "special": false,
          "text": " a"
        },
        {
          "id": 749,
          "logprob": -1.8623047,
          "special": false,
          "text": " sub"
        },
        {
          "id": 3423,
          "logprob": -0.6020508,
          "special": false,
          "text": "field"
        },
        {
          "id": 273,
          "logprob": -0.0552063,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.0742188,
          "special": false,
          "text": " machine"
        },
        {
          "id": 4715,
          "logprob": -0.011405945,
          "special": false,
          "text": " learning"
        },
        {
          "id": 326,
          "logprob": -0.9165039,
          "special": false,
          "text": " that"
        },
        {
          "id": 4648,
          "logprob": -1.4501953,
          "special": false,
          "text": " uses"
        },
        {
          "id": 13345,
          "logprob": -1.4960938,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11454,
          "logprob": -0.02116394,
          "special": false,
          "text": " neural"
        }
      ]
    },
    "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 17,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -1.5,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -8.984375,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -1.96875,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -0.93359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 428,
          "logprob": -1.5800781,
          "special": false,
          "text": " -"
        },
        {
          "id": 18147,
          "logprob": -3.3242188,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -0.31835938,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 187,
          "logprob": -2.5644531,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.5957031,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 20727,
          "logprob": -0.69628906,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 310,
          "logprob": -0.68603516,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.5258789,
          "special": false,
          "text": " a"
        },
        {
          "id": 749,
          "logprob": -1.859375,
          "special": false,
          "text": " sub"
        },
        {
          "id": 3423,
          "logprob": -0.6166992,
          "special": false,
          "text": "field"
        },
        {
          "id": 273,
          "logprob": -0.056762695,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.0703125,
          "special": false,
          "text": " machine"
        },
        {
          "id": 4715,
          "logprob": -0.011428833,
          "special": false,
          "text": " learning"
        },
        {
          "id": 326,
          "logprob": -0.9213867,
          "special": false,
          "text": " that"
        },
        {
          "id": 4648,
          "logprob": -1.4726562,
          "special": false,
          "text": " uses"
        },
        {
          "id": 13345,
          "logprob": -1.5039062,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11454,
          "logprob": -0.021652222,
          "special": false,
          "text": " neural"
        }
      ]
    },
    "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 17,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -1.5,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -8.984375,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -1.96875,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -0.93359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 428,
          "logprob": -1.5800781,
          "special": false,
          "text": " -"
        },
        {
          "id": 18147,
          "logprob": -3.3242188,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -0.31835938,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 187,
          "logprob": -2.5644531,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.5957031,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 20727,
          "logprob": -0.69628906,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 310,
          "logprob": -0.68603516,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.5258789,
          "special": false,
          "text": " a"
        },
        {
          "id": 749,
          "logprob": -1.859375,
          "special": false,
          "text": " sub"
        },
        {
          "id": 3423,
          "logprob": -0.6166992,
          "special": false,
          "text": "field"
        },
        {
          "id": 273,
          "logprob": -0.056762695,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.0703125,
          "special": false,
          "text": " machine"
        },
        {
          "id": 4715,
          "logprob": -0.011428833,
          "special": false,
          "text": " learning"
        },
        {
          "id": 326,
          "logprob": -0.9213867,
          "special": false,
          "text": " that"
        },
        {
          "id": 4648,
          "logprob": -1.4726562,
          "special": false,
          "text": " uses"
        },
        {
          "id": 13345,
          "logprob": -1.5039062,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11454,
          "logprob": -0.021652222,
          "special": false,
          "text": " neural"
        }
      ]
    },
    "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 17,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -1.5,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -8.984375,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -1.96875,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -0.93359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 428,
          "logprob": -1.5800781,
          "special": false,
          "text": " -"
        },
        {
          "id": 18147,
          "logprob": -3.3242188,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -0.31835938,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 187,
          "logprob": -2.5644531,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.5957031,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 20727,
          "logprob": -0.69628906,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 310,
          "logprob": -0.68603516,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.5258789,
          "special": false,
          "text": " a"
        },
        {
          "id": 749,
          "logprob": -1.859375,
          "special": false,
          "text": " sub"
        },
        {
          "id": 3423,
          "logprob": -0.6166992,
          "special": false,
          "text": "field"
        },
        {
          "id": 273,
          "logprob": -0.056762695,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.0703125,
          "special": false,
          "text": " machine"
        },
        {
          "id": 4715,
          "logprob": -0.011428833,
          "special": false,
          "text": " learning"
        },
        {
          "id": 326,
          "logprob": -0.9213867,
          "special": false,
          "text": " that"
        },
        {
          "id": 4648,
          "logprob": -1.4726562,
          "special": false,
          "text": " uses"
        },
        {
          "id": 13345,
          "logprob": -1.5039062,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11454,
          "logprob": -0.021652222,
          "special": false,
          "text": " neural"
        }
      ]
    },
    "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 5,
    "prefill": [
      {
        "id": 0,
        "logprob": null,
        "text": "<pad>"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 926,
        "logprob": -4.3554688,
        "special": false,
        "text": " To"
      },
      {
        "id": 18295,
        "logprob": -7.7734375,
        "special": false,
        "text": " sell"
      },
      {
        "id": 7868,
        "logprob": -3.9257812,
        "special": false,
        "text": " things"
      },
      {
        "id": 260,
        "logprob": -2.4179688,
        "special": false,
        "text": "."
      },
      {
        "id": 1,
        "logprob": 0.0,
        "special": true,
        "text": "</s>"
      }
    ]
  },
  "generated_text": "To sell things."
}


================================================
FILE: integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 0,
        "logprob": null,
        "text": "<pad>"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 16017,
        "logprob": 0.0,
        "special": false,
        "text": " blue"
      },
      {
        "id": 20495,
        "logprob": 0.0,
        "special": false,
        "text": " sky"
      },
      {
        "id": 259,
        "logprob": -0.47070312,
        "special": false,
        "text": " "
      },
      {
        "id": 261,
        "logprob": -0.15307617,
        "special": false,
        "text": ","
      },
      {
        "id": 35622,
        "logprob": -0.796875,
        "special": false,
        "text": " cloud"
      },
      {
        "id": 263,
        "logprob": -1.2958984,
        "special": false,
        "text": "s"
      },
      {
        "id": 305,
        "logprob": 0.0,
        "special": false,
        "text": " and"
      },
      {
        "id": 35622,
        "logprob": -1.2998047,
        "special": false,
        "text": " cloud"
      },
      {
        "id": 263,
        "logprob": 0.0,
        "special": false,
        "text": "s"
      },
      {
        "id": 1,
        "logprob": 0.0,
        "special": true,
        "text": "</s>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Why is the sky blue?blue sky , clouds and clouds"
}


================================================
FILE: integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 6,
      "prefill": [
        {
          "id": 0,
          "logprob": null,
          "text": "<pad>"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 259,
          "logprob": -1.3798828,
          "special": false,
          "text": " "
        },
        {
          "id": 39261,
          "logprob": -0.36328125,
          "special": false,
          "text": "Because"
        },
        {
          "id": 609,
          "logprob": -1.0947266,
          "special": false,
          "text": " it"
        },
        {
          "id": 339,
          "logprob": -0.8286133,
          "special": false,
          "text": " is"
        },
        {
          "id": 16017,
          "logprob": -1.6826172,
          "special": false,
          "text": " blue"
        },
        {
          "id": 1,
          "logprob": -0.7290039,
          "special": true,
          "text": "</s>"
        }
      ]
    },
    "generated_text": "Because it is blue"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 6,
      "prefill": [
        {
          "id": 0,
          "logprob": null,
          "text": "<pad>"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 259,
          "logprob": -1.3789062,
          "special": false,
          "text": " "
        },
        {
          "id": 39261,
          "logprob": -0.36279297,
          "special": false,
          "text": "Because"
        },
        {
          "id": 609,
          "logprob": -1.0966797,
          "special": false,
          "text": " it"
        },
        {
          "id": 339,
          "logprob": -0.8276367,
          "special": false,
          "text": " is"
        },
        {
          "id": 16017,
          "logprob": -1.6845703,
          "special": false,
          "text": " blue"
        },
        {
          "id": 1,
          "logprob": -0.72753906,
          "special": true,
          "text": "</s>"
        }
      ]
    },
    "generated_text": "Because it is blue"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 6,
      "prefill": [
        {
          "id": 0,
          "logprob": null,
          "text": "<pad>"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 259,
          "logprob": -1.3789062,
          "special": false,
          "text": " "
        },
        {
          "id": 39261,
          "logprob": -0.36279297,
          "special": false,
          "text": "Because"
        },
        {
          "id": 609,
          "logprob": -1.0966797,
          "special": false,
          "text": " it"
        },
        {
          "id": 339,
          "logprob": -0.8276367,
          "special": false,
          "text": " is"
        },
        {
          "id": 16017,
          "logprob": -1.6845703,
          "special": false,
          "text": " blue"
        },
        {
          "id": 1,
          "logprob": -0.72753906,
          "special": true,
          "text": "</s>"
        }
      ]
    },
    "generated_text": "Because it is blue"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 6,
      "prefill": [
        {
          "id": 0,
          "logprob": null,
          "text": "<pad>"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 259,
          "logprob": -1.3789062,
          "special": false,
          "text": " "
        },
        {
          "id": 39261,
          "logprob": -0.36279297,
          "special": false,
          "text": "Because"
        },
        {
          "id": 609,
          "logprob": -1.0966797,
          "special": false,
          "text": " it"
        },
        {
          "id": 339,
          "logprob": -0.8276367,
          "special": false,
          "text": " is"
        },
        {
          "id": 16017,
          "logprob": -1.6845703,
          "special": false,
          "text": " blue"
        },
        {
          "id": 1,
          "logprob": -0.72753906,
          "special": true,
          "text": "</s>"
        }
      ]
    },
    "generated_text": "Because it is blue"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_neox/test_neox.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 42,
        "logprob": -0.86279297,
        "special": false,
        "text": "I"
      },
      {
        "id": 1353,
        "logprob": -0.94921875,
        "special": false,
        "text": "'m"
      },
      {
        "id": 7016,
        "logprob": -2.1835938,
        "special": false,
        "text": " sorry"
      },
      {
        "id": 13,
        "logprob": -0.074035645,
        "special": false,
        "text": ","
      },
      {
        "id": 1394,
        "logprob": -0.86376953,
        "special": false,
        "text": "You"
      },
      {
        "id": 452,
        "logprob": -1.2070312,
        "special": false,
        "text": " have"
      },
      {
        "id": 247,
        "logprob": -1.4365234,
        "special": false,
        "text": " a"
      },
      {
        "id": 4327,
        "logprob": -1.109375,
        "special": false,
        "text": " choice"
      },
      {
        "id": 273,
        "logprob": -0.93408203,
        "special": false,
        "text": " of"
      },
      {
        "id": 752,
        "logprob": -1.8808594,
        "special": false,
        "text": " what"
      }
    ]
  },
  "generated_text": "I'm sorry,You have a choice of what"
}


================================================
FILE: integration-tests/models/__snapshots__/test_neox/test_neox_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 42,
          "logprob": -0.8618164,
          "special": false,
          "text": "I"
        },
        {
          "id": 1353,
          "logprob": -0.9506836,
          "special": false,
          "text": "'m"
        },
        {
          "id": 7016,
          "logprob": -2.1738281,
          "special": false,
          "text": " sorry"
        },
        {
          "id": 13,
          "logprob": -0.0758667,
          "special": false,
          "text": ","
        },
        {
          "id": 1394,
          "logprob": -0.9135742,
          "special": false,
          "text": "You"
        },
        {
          "id": 452,
          "logprob": -1.1445312,
          "special": false,
          "text": " have"
        },
        {
          "id": 247,
          "logprob": -1.4375,
          "special": false,
          "text": " a"
        },
        {
          "id": 4327,
          "logprob": -1.1103516,
          "special": false,
          "text": " choice"
        },
        {
          "id": 273,
          "logprob": -1.0058594,
          "special": false,
          "text": " of"
        },
        {
          "id": 752,
          "logprob": -1.921875,
          "special": false,
          "text": " what"
        }
      ]
    },
    "generated_text": "I'm sorry,You have a choice of what"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 42,
          "logprob": -0.8618164,
          "special": false,
          "text": "I"
        },
        {
          "id": 1353,
          "logprob": -0.9506836,
          "special": false,
          "text": "'m"
        },
        {
          "id": 7016,
          "logprob": -2.1738281,
          "special": false,
          "text": " sorry"
        },
        {
          "id": 13,
          "logprob": -0.0758667,
          "special": false,
          "text": ","
        },
        {
          "id": 1394,
          "logprob": -0.9135742,
          "special": false,
          "text": "You"
        },
        {
          "id": 452,
          "logprob": -1.1445312,
          "special": false,
          "text": " have"
        },
        {
          "id": 247,
          "logprob": -1.4375,
          "special": false,
          "text": " a"
        },
        {
          "id": 4327,
          "logprob": -1.1103516,
          "special": false,
          "text": " choice"
        },
        {
          "id": 273,
          "logprob": -1.0058594,
          "special": false,
          "text": " of"
        },
        {
          "id": 752,
          "logprob": -1.921875,
          "special": false,
          "text": " what"
        }
      ]
    },
    "generated_text": "I'm sorry,You have a choice of what"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 42,
          "logprob": -0.8618164,
          "special": false,
          "text": "I"
        },
        {
          "id": 1353,
          "logprob": -0.9506836,
          "special": false,
          "text": "'m"
        },
        {
          "id": 7016,
          "logprob": -2.1738281,
          "special": false,
          "text": " sorry"
        },
        {
          "id": 13,
          "logprob": -0.0758667,
          "special": false,
          "text": ","
        },
        {
          "id": 1394,
          "logprob": -0.9135742,
          "special": false,
          "text": "You"
        },
        {
          "id": 452,
          "logprob": -1.1445312,
          "special": false,
          "text": " have"
        },
        {
          "id": 247,
          "logprob": -1.4375,
          "special": false,
          "text": " a"
        },
        {
          "id": 4327,
          "logprob": -1.1103516,
          "special": false,
          "text": " choice"
        },
        {
          "id": 273,
          "logprob": -1.0058594,
          "special": false,
          "text": " of"
        },
        {
          "id": 752,
          "logprob": -1.921875,
          "special": false,
          "text": " what"
        }
      ]
    },
    "generated_text": "I'm sorry,You have a choice of what"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 42,
          "logprob": -0.8618164,
          "special": false,
          "text": "I"
        },
        {
          "id": 1353,
          "logprob": -0.9506836,
          "special": false,
          "text": "'m"
        },
        {
          "id": 7016,
          "logprob": -2.1738281,
          "special": false,
          "text": " sorry"
        },
        {
          "id": 13,
          "logprob": -0.0758667,
          "special": false,
          "text": ","
        },
        {
          "id": 1394,
          "logprob": -0.9135742,
          "special": false,
          "text": "You"
        },
        {
          "id": 452,
          "logprob": -1.1445312,
          "special": false,
          "text": " have"
        },
        {
          "id": 247,
          "logprob": -1.4375,
          "special": false,
          "text": " a"
        },
        {
          "id": 4327,
          "logprob": -1.1103516,
          "special": false,
          "text": " choice"
        },
        {
          "id": 273,
          "logprob": -1.0058594,
          "special": false,
          "text": " of"
        },
        {
          "id": 752,
          "logprob": -1.921875,
          "special": false,
          "text": " what"
        }
      ]
    },
    "generated_text": "I'm sorry,You have a choice of what"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_neox_sharded/test_neox.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 510,
        "logprob": -0.5878906,
        "special": false,
        "text": "The"
      },
      {
        "id": 3159,
        "logprob": -0.5449219,
        "special": false,
        "text": " word"
      },
      {
        "id": 346,
        "logprob": -0.05038452,
        "special": false,
        "text": " \""
      },
      {
        "id": 6441,
        "logprob": -0.002292633,
        "special": false,
        "text": "mem"
      },
      {
        "id": 70,
        "logprob": -1.3828278e-05,
        "special": false,
        "text": "e"
      },
      {
        "id": 3,
        "logprob": -0.0010242462,
        "special": false,
        "text": "\""
      },
      {
        "id": 369,
        "logprob": -0.090270996,
        "special": false,
        "text": " was"
      },
      {
        "id": 806,
        "logprob": -0.12719727,
        "special": false,
        "text": " first"
      },
      {
        "id": 908,
        "logprob": -0.016571045,
        "special": false,
        "text": " used"
      },
      {
        "id": 275,
        "logprob": -0.43432617,
        "special": false,
        "text": " in"
      }
    ]
  },
  "generated_text": "The word \"meme\" was first used in"
}


================================================
FILE: integration-tests/models/__snapshots__/test_neox_sharded/test_neox_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 510,
          "logprob": -0.5878906,
          "special": false,
          "text": "The"
        },
        {
          "id": 3159,
          "logprob": -0.5498047,
          "special": false,
          "text": " word"
        },
        {
          "id": 346,
          "logprob": -0.04815674,
          "special": false,
          "text": " \""
        },
        {
          "id": 6441,
          "logprob": -0.002313614,
          "special": false,
          "text": "mem"
        },
        {
          "id": 70,
          "logprob": -1.2636185e-05,
          "special": false,
          "text": "e"
        },
        {
          "id": 3,
          "logprob": -0.0010147095,
          "special": false,
          "text": "\""
        },
        {
          "id": 369,
          "logprob": -0.0859375,
          "special": false,
          "text": " was"
        },
        {
          "id": 806,
          "logprob": -0.12609863,
          "special": false,
          "text": " first"
        },
        {
          "id": 908,
          "logprob": -0.016601562,
          "special": false,
          "text": " used"
        },
        {
          "id": 275,
          "logprob": -0.38256836,
          "special": false,
          "text": " in"
        }
      ]
    },
    "generated_text": "The word \"meme\" was first used in"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 510,
          "logprob": -0.6201172,
          "special": false,
          "text": "The"
        },
        {
          "id": 3159,
          "logprob": -0.546875,
          "special": false,
          "text": " word"
        },
        {
          "id": 346,
          "logprob": -0.051879883,
          "special": false,
          "text": " \""
        },
        {
          "id": 6441,
          "logprob": -0.0020179749,
          "special": false,
          "text": "mem"
        },
        {
          "id": 70,
          "logprob": -9.059906e-06,
          "special": false,
          "text": "e"
        },
        {
          "id": 3,
          "logprob": -0.00096797943,
          "special": false,
          "text": "\""
        },
        {
          "id": 369,
          "logprob": -0.07940674,
          "special": false,
          "text": " was"
        },
        {
          "id": 806,
          "logprob": -0.12182617,
          "special": false,
          "text": " first"
        },
        {
          "id": 908,
          "logprob": -0.017227173,
          "special": false,
          "text": " used"
        },
        {
          "id": 275,
          "logprob": -0.44482422,
          "special": false,
          "text": " in"
        }
      ]
    },
    "generated_text": "The word \"meme\" was first used in"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 510,
          "logprob": -0.6201172,
          "special": false,
          "text": "The"
        },
        {
          "id": 3159,
          "logprob": -0.546875,
          "special": false,
          "text": " word"
        },
        {
          "id": 346,
          "logprob": -0.051879883,
          "special": false,
          "text": " \""
        },
        {
          "id": 6441,
          "logprob": -0.0020179749,
          "special": false,
          "text": "mem"
        },
        {
          "id": 70,
          "logprob": -9.059906e-06,
          "special": false,
          "text": "e"
        },
        {
          "id": 3,
          "logprob": -0.00096797943,
          "special": false,
          "text": "\""
        },
        {
          "id": 369,
          "logprob": -0.07940674,
          "special": false,
          "text": " was"
        },
        {
          "id": 806,
          "logprob": -0.12182617,
          "special": false,
          "text": " first"
        },
        {
          "id": 908,
          "logprob": -0.017227173,
          "special": false,
          "text": " used"
        },
        {
          "id": 275,
          "logprob": -0.44482422,
          "special": false,
          "text": " in"
        }
      ]
    },
    "generated_text": "The word \"meme\" was first used in"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 510,
          "logprob": -0.6201172,
          "special": false,
          "text": "The"
        },
        {
          "id": 3159,
          "logprob": -0.546875,
          "special": false,
          "text": " word"
        },
        {
          "id": 346,
          "logprob": -0.051879883,
          "special": false,
          "text": " \""
        },
        {
          "id": 6441,
          "logprob": -0.0020179749,
          "special": false,
          "text": "mem"
        },
        {
          "id": 70,
          "logprob": -1.04904175e-05,
          "special": false,
          "text": "e"
        },
        {
          "id": 3,
          "logprob": -0.0009560585,
          "special": false,
          "text": "\""
        },
        {
          "id": 369,
          "logprob": -0.08557129,
          "special": false,
          "text": " was"
        },
        {
          "id": 806,
          "logprob": -0.12084961,
          "special": false,
          "text": " first"
        },
        {
          "id": 908,
          "logprob": -0.01737976,
          "special": false,
          "text": " used"
        },
        {
          "id": 275,
          "logprob": -0.4025879,
          "special": false,
          "text": " in"
        }
      ]
    },
    "generated_text": "The word \"meme\" was first used in"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 13,
        "logprob": -2.3417969,
        "special": false,
        "text": "\n"
      },
      {
        "id": 3057,
        "logprob": -1.8730469,
        "special": false,
        "text": "Test"
      },
      {
        "id": 2009,
        "logprob": -1.2626953,
        "special": false,
        "text": " request"
      },
      {
        "id": 13,
        "logprob": -1.7060547,
        "special": false,
        "text": "\n"
      },
      {
        "id": 3057,
        "logprob": -1.4482422,
        "special": false,
        "text": "Test"
      },
      {
        "id": 2009,
        "logprob": -0.15246582,
        "special": false,
        "text": " request"
      },
      {
        "id": 13,
        "logprob": -0.796875,
        "special": false,
        "text": "\n"
      },
      {
        "id": 3057,
        "logprob": -0.22766113,
        "special": false,
        "text": "Test"
      },
      {
        "id": 2009,
        "logprob": -0.007045746,
        "special": false,
        "text": " request"
      },
      {
        "id": 13,
        "logprob": -0.021759033,
        "special": false,
        "text": "\n"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\nTest request\nTest request\nTest request\n"
}


================================================
FILE: integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_all_params.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": 0,
    "tokens": [
      {
        "id": 29899,
        "logprob": -1.4980469,
        "special": false,
        "text": "-"
      },
      {
        "id": 1454,
        "logprob": -0.19433594,
        "special": false,
        "text": "for"
      },
      {
        "id": 29899,
        "logprob": 0.0,
        "special": false,
        "text": "-"
      },
      {
        "id": 9342,
        "logprob": 0.0,
        "special": false,
        "text": "comment"
      },
      {
        "id": 29901,
        "logprob": 0.0,
        "special": false,
        "text": ":"
      },
      {
        "id": 396,
        "logprob": -0.27392578,
        "special": false,
        "text": " #"
      },
      {
        "id": 29906,
        "logprob": -0.49389648,
        "special": false,
        "text": "2"
      },
      {
        "id": 29900,
        "logprob": -0.81103516,
        "special": false,
        "text": "0"
      },
      {
        "id": 29896,
        "logprob": 0.0,
        "special": false,
        "text": "1"
      },
      {
        "id": 29955,
        "logprob": -1.0800781,
        "special": false,
        "text": "7"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request-for-comment: #2017"
}


================================================
FILE: integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -2.3359375,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -1.8623047,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -1.2451172,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -1.6923828,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -1.4492188,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -0.15197754,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -0.8022461,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -0.22583008,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -0.007095337,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -0.021652222,
          "special": false,
          "text": "\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nTest request\nTest request\nTest request\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -2.3476562,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -1.8789062,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -1.2734375,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -1.703125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -1.4677734,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -0.15454102,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -0.7973633,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -0.23278809,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -0.006980896,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -0.022033691,
          "special": false,
          "text": "\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nTest request\nTest request\nTest request\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -2.3203125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -1.8486328,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -1.2480469,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -1.7060547,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -1.4511719,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -0.1529541,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -0.81396484,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -0.22180176,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -0.007133484,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -0.021835327,
          "special": false,
          "text": "\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nTest request\nTest request\nTest request\n"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 13,
          "logprob": -2.3261719,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -1.8691406,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -1.2597656,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -1.7070312,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -1.4550781,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -0.1538086,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -0.79345703,
          "special": false,
          "text": "\n"
        },
        {
          "id": 3057,
          "logprob": -0.22924805,
          "special": false,
          "text": "Test"
        },
        {
          "id": 2009,
          "logprob": -0.0070266724,
          "special": false,
          "text": " request"
        },
        {
          "id": 13,
          "logprob": -0.021942139,
          "special": false,
          "text": "\n"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\nTest request\nTest request\nTest request\n"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_smolvlm/test_flash_smolvlm_next_simple_url.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 8,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 330,
        "logprob": -0.107421875,
        "special": false,
        "text": " A"
      },
      {
        "id": 11426,
        "logprob": -0.30078125,
        "special": false,
        "text": " bee"
      },
      {
        "id": 335,
        "logprob": -0.9609375,
        "special": false,
        "text": " on"
      },
      {
        "id": 253,
        "logprob": -0.0703125,
        "special": false,
        "text": " a"
      },
      {
        "id": 11986,
        "logprob": -0.5,
        "special": false,
        "text": " pink"
      },
      {
        "id": 8525,
        "logprob": -0.09716797,
        "special": false,
        "text": " flower"
      },
      {
        "id": 30,
        "logprob": -1.078125,
        "special": false,
        "text": "."
      },
      {
        "id": 49154,
        "logprob": -0.110839844,
        "special": true,
        "text": "<end_of_utterance>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " A bee on a pink flower."
}


================================================
FILE: integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 7,
    "prefill": [
      {
        "id": 0,
        "logprob": null,
        "text": "<pad>"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 3,
        "logprob": -0.7001953,
        "special": false,
        "text": " "
      },
      {
        "id": 18,
        "logprob": -1.1943359,
        "special": false,
        "text": "-"
      },
      {
        "id": 26937,
        "logprob": -1.2099609,
        "special": false,
        "text": "196"
      },
      {
        "id": 3,
        "logprob": -1.2451172,
        "special": false,
        "text": " "
      },
      {
        "id": 1956,
        "logprob": -0.3322754,
        "special": false,
        "text": "°"
      },
      {
        "id": 254,
        "logprob": -0.19213867,
        "special": false,
        "text": "C"
      },
      {
        "id": 1,
        "logprob": -0.030151367,
        "special": true,
        "text": "</s>"
      }
    ]
  },
  "generated_text": "-196 °C"
}


================================================
FILE: integration-tests/models/__snapshots__/test_t5_sharded/test_t5_sharded_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 7,
      "prefill": [
        {
          "id": 0,
          "logprob": null,
          "text": "<pad>"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 3,
          "logprob": -0.7001953,
          "special": false,
          "text": " "
        },
        {
          "id": 18,
          "logprob": -1.1943359,
          "special": false,
          "text": "-"
        },
        {
          "id": 26937,
          "logprob": -1.2119141,
          "special": false,
          "text": "196"
        },
        {
          "id": 3,
          "logprob": -1.2480469,
          "special": false,
          "text": " "
        },
        {
          "id": 1956,
          "logprob": -0.33203125,
          "special": false,
          "text": "°"
        },
        {
          "id": 254,
          "logprob": -0.19250488,
          "special": false,
          "text": "C"
        },
        {
          "id": 1,
          "logprob": -0.030166626,
          "special": true,
          "text": "</s>"
        }
      ]
    },
    "generated_text": "-196 °C"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 7,
      "prefill": [
        {
          "id": 0,
          "logprob": null,
          "text": "<pad>"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 3,
          "logprob": -0.7001953,
          "special": false,
          "text": " "
        },
        {
          "id": 18,
          "logprob": -1.1943359,
          "special": false,
          "text": "-"
        },
        {
          "id": 26937,
          "logprob": -1.2119141,
          "special": false,
          "text": "196"
        },
        {
          "id": 3,
          "logprob": -1.2480469,
          "special": false,
          "text": " "
        },
        {
          "id": 1956,
          "logprob": -0.33203125,
          "special": false,
          "text": "°"
        },
        {
          "id": 254,
          "logprob": -0.19250488,
          "special": false,
          "text": "C"
        },
        {
          "id": 1,
          "logprob": -0.030166626,
          "special": true,
          "text": "</s>"
        }
      ]
    },
    "generated_text": "-196 °C"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 7,
      "prefill": [
        {
          "id": 0,
          "logprob": null,
          "text": "<pad>"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 3,
          "logprob": -0.7001953,
          "special": false,
          "text": " "
        },
        {
          "id": 18,
          "logprob": -1.1943359,
          "special": false,
          "text": "-"
        },
        {
          "id": 26937,
          "logprob": -1.2119141,
          "special": false,
          "text": "196"
        },
        {
          "id": 3,
          "logprob": -1.2480469,
          "special": false,
          "text": " "
        },
        {
          "id": 1956,
          "logprob": -0.33203125,
          "special": false,
          "text": "°"
        },
        {
          "id": 254,
          "logprob": -0.19250488,
          "special": false,
          "text": "C"
        },
        {
          "id": 1,
          "logprob": -0.030166626,
          "special": true,
          "text": "</s>"
        }
      ]
    },
    "generated_text": "-196 °C"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "eos_token",
      "generated_tokens": 7,
      "prefill": [
        {
          "id": 0,
          "logprob": null,
          "text": "<pad>"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 3,
          "logprob": -0.7001953,
          "special": false,
          "text": " "
        },
        {
          "id": 18,
          "logprob": -1.1943359,
          "special": false,
          "text": "-"
        },
        {
          "id": 26937,
          "logprob": -1.2099609,
          "special": false,
          "text": "196"
        },
        {
          "id": 3,
          "logprob": -1.2451172,
          "special": false,
          "text": " "
        },
        {
          "id": 1956,
          "logprob": -0.3322754,
          "special": false,
          "text": "°"
        },
        {
          "id": 254,
          "logprob": -0.19213867,
          "special": false,
          "text": "C"
        },
        {
          "id": 1,
          "logprob": -0.030151367,
          "special": true,
          "text": "</s>"
        }
      ]
    },
    "generated_text": "-196 °C"
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto_nostream.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": null,
        "role": "assistant",
        "tool_calls": [
          {
            "function": {
              "arguments": "{\"location\":\"Brooklyn, NY\",\"format\":\"fahrenheit\"}",
              "description": null,
              "name": "get_current_weather"
            },
            "id": "0",
            "type": "function"
          }
        ]
      }
    }
  ],
  "created": 1741372434,
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.2-dev0-native",
  "usage": {
    "completion_tokens": 29,
    "prompt_tokens": 501,
    "total_tokens": 530
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_nostream.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": null,
        "role": "assistant",
        "tool_calls": [
          {
            "function": {
              "arguments": "{\"location\":\"Brooklyn, NY\",\"format\":\"fahrenheit\"}",
              "description": null,
              "name": "get_current_weather"
            },
            "id": "0",
            "type": "function"
          }
        ]
      }
    }
  ],
  "created": 1741372657,
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.2-dev0-native",
  "usage": {
    "completion_tokens": 29,
    "prompt_tokens": 286,
    "total_tokens": 315
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice_stream.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "{",
                "name": "get_current_weather"
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " \"",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "location",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "\":",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " \"",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "Bro",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "oklyn",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": ",",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " NY",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "\",",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " \"",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "format",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "\":",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " \"",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "f",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "ahrenheit",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "\"}",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741688515,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_nostream.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI",
        "role": "assistant",
        "tool_calls": null
      }
    }
  ],
  "created": 1741693957,
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.2-dev0-native",
  "usage": {
    "completion_tokens": 12,
    "prompt_tokens": 53,
    "total_tokens": 65
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information_stream.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": "I",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "'m",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " an",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " artificial",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " intelligence",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " model",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " known",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " as",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " a",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " large",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " language",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " model",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " (",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "LL",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "M",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ")",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " or",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " convers",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "ational",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " AI",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": "length",
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741694017,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_nostream.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": null,
        "role": "assistant",
        "tool_calls": [
          {
            "function": {
              "arguments": "{\"location\":\"Brooklyn, NY\",\"format\":\"fahrenheit\"}",
              "description": null,
              "name": "get_current_weather"
            },
            "id": "0",
            "type": "function"
          }
        ]
      }
    }
  ],
  "created": 1741372335,
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.2-dev0-native",
  "usage": {
    "completion_tokens": 29,
    "prompt_tokens": 501,
    "total_tokens": 530
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_openai.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "{",
                "name": "get_current_weather"
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " \"",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "location",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "\":",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " \"",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "Bro",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "oklyn",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": ",",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " NY",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "\",",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " \"",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "format",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "\":",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": " \"",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "f",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "ahrenheit",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": null,
          "function_call": null,
          "refusal": null,
          "role": "assistant",
          "tool_calls": [
            {
              "function": {
                "arguments": "\"}",
                "name": null
              },
              "id": "0",
              "index": 0,
              "type": "function"
            }
          ]
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741689423,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "service_tier": null,
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_auto.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": "Once",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " upon",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " a",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " time",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ",",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " in",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " a",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " vibrant",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " ocean",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " filled",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " with",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " coral",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " reefs",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " and",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " schools",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " of",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " shimmer",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "ing",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " fish",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ",",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": "length",
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741695408,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json
================================================
[]


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_none.json
================================================
[
  {
    "choices": [
      {
        "delta": {
          "content": "Once",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263693,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " upon",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263693,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " a",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263693,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " time",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263693,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ",",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " in",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " a",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " vibrant",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " ocean",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " filled",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " with",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " coral",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " reefs",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " and",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " schools",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " of",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " shimmer",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "ing",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " fish",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ",",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " lived",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " three",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " dear",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " friends",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ":",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " Luna",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " the",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " sea",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " turtle",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ",",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " Fin",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "ley",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " the",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " friendly",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " fish",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263694,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ",",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " and",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " Cr",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "usty",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " the",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " wise",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " crab",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ".\n\n",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "L",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "una",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " was",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " the",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " oldest",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " of",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " the",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " three",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ".",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " She",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " had",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " traveled",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " the",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " world",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ",",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " exploring",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " hidden",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " caves",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " and",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " ship",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "w",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "re",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263695,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "cks",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ",",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " and",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " collecting",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " sparkling",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " shells",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " and",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " shiny",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " pe",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "bb",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "les",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ".",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " Her",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " shell",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " was",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " a",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " beautiful",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " mosaic",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " of",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " blues",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " and",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " greens",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": ",",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " and",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " her",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " gentle",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " eyes",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " twink",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": "led",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " with",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " the",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263696,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " secrets",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263697,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " of",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263697,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " the",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": null,
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263697,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  },
  {
    "choices": [
      {
        "delta": {
          "content": " deep",
          "role": "assistant",
          "tool_calls": null
        },
        "finish_reason": "length",
        "index": 0,
        "logprobs": null
      }
    ],
    "created": 1741263697,
    "id": "",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "object": "chat.completion.chunk",
    "system_fingerprint": "3.1.2-dev0-native",
    "usage": null
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_sea_creatures_stream_required.json
================================================
[]


================================================
FILE: integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_tool_reply_response.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "I can't access real-time data, but I can provide you with current conditions and forecast for Paris, France:\n\nThe current conditions in Paris are mostly cloudy with a temperature of 6.7°C (44.1°F). \n\nPlease note that the actual weather may differ from the provided information. For up-to-date information, I suggest checking a reliable weather website or app for the latest conditions and forecast.",
        "role": "assistant",
        "tool_calls": null
      }
    }
  ],
  "created": 1741263702,
  "id": "",
  "model": "meta-llama/Llama-3.1-8B-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.1.2-dev0-native",
  "usage": {
    "completion_tokens": 83,
    "prompt_tokens": 109,
    "total_tokens": 192
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 100,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 2721,
        "logprob": -0.21582031,
        "special": false,
        "text": " people"
      },
      {
        "id": 21807,
        "logprob": -0.26953125,
        "special": false,
        "text": " died"
      },
      {
        "id": 310,
        "logprob": -0.95703125,
        "special": false,
        "text": " in"
      },
      {
        "id": 290,
        "logprob": -1.3359375,
        "special": false,
        "text": " the"
      },
      {
        "id": 220,
        "logprob": -1.3828125,
        "special": false,
        "text": " "
      },
      {
        "id": 7284,
        "logprob": -0.011291504,
        "special": false,
        "text": "191"
      },
      {
        "id": 36,
        "logprob": -0.011413574,
        "special": false,
        "text": "8"
      },
      {
        "id": 18938,
        "logprob": -0.23242188,
        "special": false,
        "text": " flu"
      },
      {
        "id": 27650,
        "logprob": -0.0010070801,
        "special": false,
        "text": " pandemic"
      },
      {
        "id": 26,
        "logprob": -0.69140625,
        "special": false,
        "text": "."
      },
      {
        "id": 114059,
        "logprob": -1.4375,
        "special": false,
        "text": " Estimating"
      },
      {
        "id": 290,
        "logprob": -0.24316406,
        "special": false,
        "text": " the"
      },
      {
        "id": 10593,
        "logprob": -0.37304688,
        "special": false,
        "text": " death"
      },
      {
        "id": 49973,
        "logprob": -0.025390625,
        "special": false,
        "text": " toll"
      },
      {
        "id": 323,
        "logprob": -0.27539062,
        "special": false,
        "text": " of"
      },
      {
        "id": 290,
        "logprob": -0.057617188,
        "special": false,
        "text": " the"
      },
      {
        "id": 220,
        "logprob": -0.040527344,
        "special": false,
        "text": " "
      },
      {
        "id": 7284,
        "logprob": -0.00050735474,
        "special": false,
        "text": "191"
      },
      {
        "id": 36,
        "logprob": -9.298325e-06,
        "special": false,
        "text": "8"
      },
      {
        "id": 18938,
        "logprob": -0.09863281,
        "special": false,
        "text": " flu"
      },
      {
        "id": 27650,
        "logprob": -0.0011749268,
        "special": false,
        "text": " pandemic"
      },
      {
        "id": 373,
        "logprob": -0.32421875,
        "special": false,
        "text": " is"
      },
      {
        "id": 8210,
        "logprob": -0.58203125,
        "special": false,
        "text": " difficult"
      },
      {
        "id": 2895,
        "logprob": -0.40429688,
        "special": false,
        "text": " because"
      },
      {
        "id": 323,
        "logprob": -1.2734375,
        "special": false,
        "text": " of"
      },
      {
        "id": 49119,
        "logprob": -0.51171875,
        "special": false,
        "text": " incomplete"
      },
      {
        "id": 13308,
        "logprob": -0.38085938,
        "special": false,
        "text": " records"
      },
      {
        "id": 341,
        "logprob": -0.55859375,
        "special": false,
        "text": " and"
      },
      {
        "id": 2895,
        "logprob": -0.765625,
        "special": false,
        "text": " because"
      },
      {
        "id": 323,
        "logprob": -1.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 290,
        "logprob": -0.828125,
        "special": false,
        "text": " the"
      },
      {
        "id": 2304,
        "logprob": -1.015625,
        "special": false,
        "text": " fact"
      },
      {
        "id": 511,
        "logprob": -0.004638672,
        "special": false,
        "text": " that"
      },
      {
        "id": 2233,
        "logprob": -0.953125,
        "special": false,
        "text": " many"
      },
      {
        "id": 323,
        "logprob": -0.87890625,
        "special": false,
        "text": " of"
      },
      {
        "id": 290,
        "logprob": -0.60546875,
        "special": false,
        "text": " the"
      },
      {
        "id": 6759,
        "logprob": -1.6484375,
        "special": false,
        "text": " extra"
      },
      {
        "id": 40657,
        "logprob": -0.00022125244,
        "special": false,
        "text": " deaths"
      },
      {
        "id": 1610,
        "logprob": -0.67578125,
        "special": false,
        "text": " were"
      },
      {
        "id": 702,
        "logprob": -0.30664062,
        "special": false,
        "text": " not"
      },
      {
        "id": 48692,
        "logprob": -0.1953125,
        "special": false,
        "text": " attributed"
      },
      {
        "id": 328,
        "logprob": -0.0079956055,
        "special": false,
        "text": " to"
      },
      {
        "id": 290,
        "logprob": -0.515625,
        "special": false,
        "text": " the"
      },
      {
        "id": 18938,
        "logprob": -0.0040893555,
        "special": false,
        "text": " flu"
      },
      {
        "id": 26,
        "logprob": -0.083496094,
        "special": false,
        "text": "."
      },
      {
        "id": 13618,
        "logprob": -0.515625,
        "special": false,
        "text": " Many"
      },
      {
        "id": 22215,
        "logprob": -1.5703125,
        "special": false,
        "text": " experts"
      },
      {
        "id": 11081,
        "logprob": -0.96875,
        "special": false,
        "text": " believe"
      },
      {
        "id": 511,
        "logprob": -0.1171875,
        "special": false,
        "text": " that"
      },
      {
        "id": 290,
        "logprob": -0.25195312,
        "special": false,
        "text": " the"
      },
      {
        "id": 220,
        "logprob": -0.828125,
        "special": false,
        "text": " "
      },
      {
        "id": 7284,
        "logprob": -0.00010967255,
        "special": false,
        "text": "191"
      },
      {
        "id": 36,
        "logprob": -8.535385e-05,
        "special": false,
        "text": "8"
      },
      {
        "id": 18938,
        "logprob": -0.056152344,
        "special": false,
        "text": " flu"
      },
      {
        "id": 27650,
        "logprob": -0.0007095337,
        "special": false,
        "text": " pandemic"
      },
      {
        "id": 26132,
        "logprob": -0.18847656,
        "special": false,
        "text": " killed"
      },
      {
        "id": 1867,
        "logprob": -0.71484375,
        "special": false,
        "text": " between"
      },
      {
        "id": 220,
        "logprob": -0.0062561035,
        "special": false,
        "text": " "
      },
      {
        "id": 1175,
        "logprob": -0.009277344,
        "special": false,
        "text": "50"
      },
      {
        "id": 341,
        "logprob": -0.15332031,
        "special": false,
        "text": " and"
      },
      {
        "id": 220,
        "logprob": -8.34465e-07,
        "special": false,
        "text": " "
      },
      {
        "id": 1135,
        "logprob": -0.00065612793,
        "special": false,
        "text": "100"
      },
      {
        "id": 5534,
        "logprob": -1.4066696e-05,
        "special": false,
        "text": " million"
      },
      {
        "id": 2721,
        "logprob": -0.0008392334,
        "special": false,
        "text": " people"
      },
      {
        "id": 26,
        "logprob": -0.54296875,
        "special": false,
        "text": "."
      },
      {
        "id": 372,
        "logprob": -1.8046875,
        "special": false,
        "text": " I"
      },
      {
        "id": 140680,
        "logprob": -0.578125,
        "special": false,
        "text": "assistant"
      },
      {
        "id": 200006,
        "logprob": 0.0,
        "special": true,
        "text": "<|header_end|>"
      },
      {
        "id": 368,
        "logprob": 0.0,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 954,
        "logprob": -0.032226562,
        "special": false,
        "text": "The"
      },
      {
        "id": 220,
        "logprob": -4.4345856e-05,
        "special": false,
        "text": " "
      },
      {
        "id": 7284,
        "logprob": 0.0,
        "special": false,
        "text": "191"
      },
      {
        "id": 36,
        "logprob": 0.0,
        "special": false,
        "text": "8"
      },
      {
        "id": 18938,
        "logprob": -0.015625,
        "special": false,
        "text": " flu"
      },
      {
        "id": 27650,
        "logprob": 0.0,
        "special": false,
        "text": " pandemic"
      },
      {
        "id": 24,
        "logprob": -0.0072021484,
        "special": false,
        "text": ","
      },
      {
        "id": 1437,
        "logprob": -0.0001707077,
        "special": false,
        "text": " also"
      },
      {
        "id": 5711,
        "logprob": 0.0,
        "special": false,
        "text": " known"
      },
      {
        "id": 486,
        "logprob": 0.0,
        "special": false,
        "text": " as"
      },
      {
        "id": 290,
        "logprob": -5.9604645e-07,
        "special": false,
        "text": " the"
      },
      {
        "id": 25836,
        "logprob": -1.4305115e-06,
        "special": false,
        "text": " Spanish"
      },
      {
        "id": 18938,
        "logprob": -0.0015029907,
        "special": false,
        "text": " flu"
      },
      {
        "id": 24,
        "logprob": -0.0052490234,
        "special": false,
        "text": ","
      },
      {
        "id": 373,
        "logprob": -0.3125,
        "special": false,
        "text": " is"
      },
      {
        "id": 26078,
        "logprob": -0.21289062,
        "special": false,
        "text": " indeed"
      },
      {
        "id": 1085,
        "logprob": -0.080078125,
        "special": false,
        "text": " one"
      },
      {
        "id": 323,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 290,
        "logprob": 0.0,
        "special": false,
        "text": " the"
      },
      {
        "id": 2167,
        "logprob": -0.20117188,
        "special": false,
        "text": " most"
      },
      {
        "id": 92679,
        "logprob": -0.12695312,
        "special": false,
        "text": " devastating"
      },
      {
        "id": 854,
        "logprob": -0.25976562,
        "special": false,
        "text": " public"
      },
      {
        "id": 4500,
        "logprob": 0.0,
        "special": false,
        "text": " health"
      },
      {
        "id": 93079,
        "logprob": -0.50390625,
        "special": false,
        "text": " crises"
      },
      {
        "id": 310,
        "logprob": 0.0,
        "special": false,
        "text": " in"
      },
      {
        "id": 6023,
        "logprob": -0.0015182495,
        "special": false,
        "text": " human"
      },
      {
        "id": 7068,
        "logprob": 0.0,
        "special": false,
        "text": " history"
      },
      {
        "id": 26,
        "logprob": -0.0012664795,
        "special": false,
        "text": "."
      },
      {
        "id": 114059,
        "logprob": -0.004119873,
        "special": false,
        "text": " Estimating"
      },
      {
        "id": 290,
        "logprob": -0.00033569336,
        "special": false,
        "text": " the"
      },
      {
        "id": 6318,
        "logprob": -0.20117188,
        "special": false,
        "text": " exact"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " people died in the 1918 flu pandemic. Estimating the death toll of the 1918 flu pandemic is difficult because of incomplete records and because of the fact that many of the extra deaths were not attributed to the flu. Many experts believe that the 1918 flu pandemic killed between 50 and 100 million people. Iassistant\n\nThe 1918 flu pandemic, also known as the Spanish flu, is indeed one of the most devastating public health crises in human history. Estimating the exact"
}


================================================
FILE: integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_jpg.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image is a blank white space with no visible objects or features. It appears to be an empty or placeholder image, devoid of any content or visual elements.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1743861910,
  "id": "",
  "model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.2.1-dev0-native",
  "usage": {
    "completion_tokens": 34,
    "prompt_tokens": 166,
    "total_tokens": 200
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgb_png.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image is a blank white space with no visible objects or features.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1743861909,
  "id": "",
  "model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.2.1-dev0-native",
  "usage": {
    "completion_tokens": 15,
    "prompt_tokens": 166,
    "total_tokens": 181
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_base64_rgba.json
================================================
{
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image is a black background with no discernible objects or features. The image appears to be a blank or empty space, devoid of any visual elements.\n\n**Key Features:**\n\n* **Color:** The dominant color of the image is black.\n* **Objects:** There are no visible objects or shapes in the image.\n* **Background:** The background of the image is a solid black color.\n\n**Conclusion:**\nIn summary, the image is a simple and empty visual representation with a black background and no",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1743861909,
  "id": "",
  "model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.2.1-dev0-native",
  "usage": {
    "completion_tokens": 100,
    "prompt_tokens": 166,
    "total_tokens": 266
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image shows a brown cow standing on the beach with a white face and black and white marking on its ears. The cow has a white patch around its nose and mouth. The ocean and blue sky are in the background.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1743863057,
  "id": "",
  "model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.2.1-dev0-native",
  "usage": {
    "completion_tokens": 46,
    "prompt_tokens": 164,
    "total_tokens": 210
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_transformers_llama4/test_flash_llama4_image_cow_dog.json
================================================
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "The image does not depict a dog; it shows a cow standing on a beach. Therefore, there is no breed of a dog to identify.",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1743863056,
  "id": "",
  "model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
  "object": "chat.completion",
  "system_fingerprint": "3.2.1-dev0-native",
  "usage": {
    "completion_tokens": 30,
    "prompt_tokens": 168,
    "total_tokens": 198
  }
}


================================================
FILE: integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_load.json
================================================
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 27,
          "logprob": -1.3457031,
          "special": false,
          "text": ":"
        },
        {
          "id": 187,
          "logprob": -1.453125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 187,
          "logprob": -1.4111328,
          "special": false,
          "text": "\n"
        },
        {
          "id": 11202,
          "logprob": -0.45898438,
          "special": false,
          "text": "```"
        },
        {
          "id": 8456,
          "logprob": -0.41918945,
          "special": false,
          "text": "json"
        },
        {
          "id": 187,
          "logprob": -0.003189087,
          "special": false,
          "text": "\n"
        },
        {
          "id": 92,
          "logprob": -0.061187744,
          "special": false,
          "text": "{"
        },
        {
          "id": 187,
          "logprob": -0.009010315,
          "special": false,
          "text": "\n"
        },
        {
          "id": 50276,
          "logprob": -0.484375,
          "special": false,
          "text": "  "
        },
        {
          "id": 3,
          "logprob": -0.0002951622,
          "special": false,
          "text": "\""
        }
      ],
      "top_tokens": null
    },
    "generated_text": ":\n\n```json\n{\n  \""
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 27,
          "logprob": -1.3457031,
          "special": false,
          "text": ":"
        },
        {
          "id": 187,
          "logprob": -1.453125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 187,
          "logprob": -1.4111328,
          "special": false,
          "text": "\n"
        },
        {
          "id": 11202,
          "logprob": -0.45898438,
          "special": false,
          "text": "```"
        },
        {
          "id": 8456,
          "logprob": -0.41918945,
          "special": false,
          "text": "json"
        },
        {
          "id": 187,
          "logprob": -0.003189087,
          "special": false,
          "text": "\n"
        },
        {
          "id": 92,
          "logprob": -0.061187744,
          "special": false,
          "text": "{"
        },
        {
          "id": 187,
          "logprob": -0.009010315,
          "special": false,
          "text": "\n"
        },
        {
          "id": 50276,
          "logprob": -0.484375,
          "special": false,
          "text": "  "
        },
        {
          "id": 3,
          "logprob": -0.0002951622,
          "special": false,
          "text": "\""
        }
      ],
      "top_tokens": null
    },
    "generated_text": ":\n\n```json\n{\n  \""
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 27,
          "logprob": -1.3457031,
          "special": false,
          "text": ":"
        },
        {
          "id": 187,
          "logprob": -1.453125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 187,
          "logprob": -1.4111328,
          "special": false,
          "text": "\n"
        },
        {
          "id": 11202,
          "logprob": -0.45898438,
          "special": false,
          "text": "```"
        },
        {
          "id": 8456,
          "logprob": -0.41918945,
          "special": false,
          "text": "json"
        },
        {
          "id": 187,
          "logprob": -0.003189087,
          "special": false,
          "text": "\n"
        },
        {
          "id": 92,
          "logprob": -0.061187744,
          "special": false,
          "text": "{"
        },
        {
          "id": 187,
          "logprob": -0.009010315,
          "special": false,
          "text": "\n"
        },
        {
          "id": 50276,
          "logprob": -0.484375,
          "special": false,
          "text": "  "
        },
        {
          "id": 3,
          "logprob": -0.0002951622,
          "special": false,
          "text": "\""
        }
      ],
      "top_tokens": null
    },
    "generated_text": ":\n\n```json\n{\n  \""
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [],
      "seed": null,
      "tokens": [
        {
          "id": 27,
          "logprob": -1.3457031,
          "special": false,
          "text": ":"
        },
        {
          "id": 187,
          "logprob": -1.453125,
          "special": false,
          "text": "\n"
        },
        {
          "id": 187,
          "logprob": -1.4111328,
          "special": false,
          "text": "\n"
        },
        {
          "id": 11202,
          "logprob": -0.45898438,
          "special": false,
          "text": "```"
        },
        {
          "id": 8456,
          "logprob": -0.41918945,
          "special": false,
          "text": "json"
        },
        {
          "id": 187,
          "logprob": -0.003189087,
          "special": false,
          "text": "\n"
        },
        {
          "id": 92,
          "logprob": -0.061187744,
          "special": false,
          "text": "{"
        },
        {
          "id": 187,
          "logprob": -0.009010315,
          "special": false,
          "text": "\n"
        },
        {
          "id": 50276,
          "logprob": -0.484375,
          "special": false,
          "text": "  "
        },
        {
          "id": 3,
          "logprob": -0.0002951622,
          "special": false,
          "text": "\""
        }
      ],
      "top_tokens": null
    },
    "generated_text": ":\n\n```json\n{\n  \""
  }
]


================================================
FILE: integration-tests/models/__snapshots__/test_transformers_olmo/test_flash_llama_simple.json
================================================
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 27,
        "logprob": -1.3457031,
        "special": false,
        "text": ":"
      },
      {
        "id": 187,
        "logprob": -1.4580078,
        "special": false,
        "text": "\n"
      },
      {
        "id": 187,
        "logprob": -1.4472656,
        "special": false,
        "text": "\n"
      },
      {
        "id": 11202,
        "logprob": -0.46044922,
        "special": false,
        "text": "```"
      },
      {
        "id": 8456,
        "logprob": -0.4206543,
        "special": false,
        "text": "json"
      },
      {
        "id": 187,
        "logprob": -0.0031471252,
        "special": false,
        "text": "\n"
      },
      {
        "id": 92,
        "logprob": -0.061187744,
        "special": false,
        "text": "{"
      },
      {
        "id": 187,
        "logprob": -0.009033203,
        "special": false,
        "text": "\n"
      },
      {
        "id": 50276,
        "logprob": -0.48461914,
        "special": false,
        "text": "  "
      },
      {
        "id": 3,
        "logprob": -0.0002901554,
        "special": false,
        "text": "\""
      }
    ],
    "top_tokens": null
  },
  "generated_text": ":\n\n```json\n{\n  \""
}


================================================
FILE: integration-tests/models/test_bloom_560m.py
================================================
import pytest


@pytest.fixture(scope="module")
def bloom_560_handle(launcher):
    with launcher("bigscience/bloom-560m", num_shard=1) as handle:
        yield handle


@pytest.fixture(scope="module")
async def bloom_560(bloom_560_handle):
    await bloom_560_handle.health(240)
    return bloom_560_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_bloom_560m(bloom_560, response_snapshot):
    response = await bloom_560.generate(
        "Pour déguster un ortolan, il faut tout d'abord",
        max_new_tokens=10,
        top_p=0.9,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_bloom_560m_all_params(bloom_560, response_snapshot):
    response = await bloom_560.generate(
        "Pour déguster un ortolan, il faut tout d'abord",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_bloom_560m_load(bloom_560, generate_load, response_snapshot):
    responses = await generate_load(
        bloom_560,
        "Pour déguster un ortolan, il faut tout d'abord",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_bloom_560m_sharded.py
================================================
import pytest


@pytest.fixture(scope="module")
def bloom_560m_sharded_handle(launcher):
    with launcher("bigscience/bloom-560m", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def bloom_560m_sharded(bloom_560m_sharded_handle):
    await bloom_560m_sharded_handle.health(240)
    return bloom_560m_sharded_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot):
    response = await bloom_560m_sharded.generate(
        "Pour déguster un ortolan, il faut tout d'abord",
        max_new_tokens=10,
        top_p=0.9,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_bloom_560m_sharded_load(
    bloom_560m_sharded, generate_load, response_snapshot
):
    responses = await generate_load(
        bloom_560m_sharded,
        "Pour déguster un ortolan, il faut tout d'abord",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_chat_llama.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_chat_handle(launcher):
    with launcher(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0", num_shard=2, disable_grammar_support=False
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_chat(flash_llama_chat_handle):
    await flash_llama_chat_handle.health(300)
    return flash_llama_chat_handle.client


@pytest.mark.private
async def test_flash_llama_simple(flash_llama_chat, response_snapshot):
    response = await flash_llama_chat.chat(
        max_tokens=100,
        seed=1,
        messages=[
            {
                "role": "system",
                "content": "Youre a helpful assistant! Answer the users question best you can.",
            },
            {
                "role": "user",
                "content": "What is the weather like in Brooklyn, New York?",
            },
        ],
    )

    print(repr(response.choices[0].message.content))
    assert (
        response.choices[0].message.content
        == "As of your last question, the weather in Brooklyn, New York, is typically hot and humid throughout the year. The suburbs around New York City are jealously sheltered, and at least in the Lower Bronx, there are very few outdoor environments to appreciate nature.\n\nIn terms of temperature, the warmest times of the year are from June to August, when average high temperatures typically range from around 73°F or 23°C"
    )
    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_chat_stream_options.py
================================================
import pytest


@pytest.fixture(scope="module")
def chat_handle(launcher):
    with launcher(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def chat_client(chat_handle):
    await chat_handle.health(300)
    return chat_handle.client


================================================
FILE: integration-tests/models/test_completion_prompts.py
================================================
import pytest
import requests
from openai import OpenAI
from huggingface_hub import InferenceClient


@pytest.fixture(scope="module")
def flash_llama_completion_handle(launcher):
    with launcher(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_completion(flash_llama_completion_handle):
    await flash_llama_completion_handle.health(300)
    return flash_llama_completion_handle.client


# NOTE: since `v1/completions` is a deprecated inferface/endpoint we do not provide a convience
# method for it. Instead, we use the `requests` library to make the HTTP request directly.


@pytest.mark.release
def test_flash_llama_completion_single_prompt(
    flash_llama_completion, response_snapshot
):
    response = requests.post(
        f"{flash_llama_completion.base_url}/v1/completions",
        json={
            "model": "tgi",
            "prompt": "What is Deep Learning?",
            "max_tokens": 10,
            "temperature": 0.0,
        },
        headers=flash_llama_completion.headers,
        stream=False,
    )
    response = response.json()
    assert len(response["choices"]) == 1
    assert (
        response["choices"][0]["text"]
        == " A Beginner’s Guide\nDeep learning is a subset"
    )
    assert response == response_snapshot


@pytest.mark.release
async def test_flash_llama_completion_stream_usage(
    flash_llama_completion, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat_completion(
        model="tgi",
        messages=[
            {
                "role": "user",
                "content": "What is Deep Learning?",
            }
        ],
        max_tokens=10,
        temperature=0.0,
        stream_options={"include_usage": True},
        stream=True,
    )
    string = ""
    chunks = []
    had_usage = False
    for chunk in stream:
        # remove "data:"
        chunks.append(chunk)
        if len(chunk.choices) == 1:
            index = chunk.choices[0].index
            assert index == 0
            string += chunk.choices[0].delta.content
        if chunk.usage:
            assert not had_usage
            had_usage = True

    assert had_usage
    assert (
        string
        == "**Deep Learning: An Overview**\n=====================================\n\n"
    )
    assert chunks == response_snapshot

    stream = client.chat_completion(
        model="tgi",
        messages=[
            {
                "role": "user",
                "content": "What is Deep Learning?",
            }
        ],
        max_tokens=10,
        temperature=0.0,
        # No usage
        # stream_options={"include_usage": True},
        stream=True,
    )
    string = ""
    chunks = []
    had_usage = False
    for chunk in stream:
        chunks.append(chunk)
        assert chunk.usage is None
        assert len(chunk.choices) == 1
        assert chunk.choices[0].index == 0
        string += chunk.choices[0].delta.content
    assert (
        string
        == "**Deep Learning: An Overview**\n=====================================\n\n"
    )


@pytest.mark.release
def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot):
    response = requests.post(
        f"{flash_llama_completion.base_url}/v1/completions",
        json={
            "model": "tgi",
            "prompt": [
                "What is Deep Learning?",
                "Is water wet?",
                "What is the capital of France?",
                "def mai",
            ],
            "max_tokens": 10,
            "seed": 0,
            "temperature": 0.0,
        },
        headers=flash_llama_completion.headers,
        stream=False,
    )
    response = response.json()
    assert len(response["choices"]) == 4

    all_indexes = [(choice["index"], choice["text"]) for choice in response["choices"]]
    all_indexes.sort()
    all_indices, all_strings = zip(*all_indexes)
    assert list(all_indices) == [0, 1, 2, 3]
    assert list(all_strings) == [
        " A Beginner’s Guide\nDeep learning is a subset",
        " This is a question that has puzzled many people for",
        " Paris\nWhat is the capital of France?\nThe",
        'usculas_minusculas(s):\n    """\n',
    ]

    assert response == response_snapshot


@pytest.mark.release
async def test_flash_llama_completion_many_prompts_stream(
    flash_llama_completion, response_snapshot
):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.completions.create(
        model="tgi",
        prompt=[
            "What is Deep Learning?",
            "Is water wet?",
            "What is the capital of France?",
            "def mai",
        ],
        max_tokens=10,
        seed=0,
        temperature=0.0,
        stream=True,
    )

    strings = [""] * 4
    chunks = []
    for chunk in stream:
        chunks.append(chunk)
        index = chunk.choices[0].index
        assert 0 <= index <= 4
        strings[index] += chunk.choices[0].text

    assert list(strings) == [
        " A Beginner’s Guide\nDeep learning is a subset",
        " This is a question that has puzzled many people for",
        " Paris\nWhat is the capital of France?\nThe",
        'usculas_minusculas(s):\n    """\n',
    ]
    assert chunks == response_snapshot


@pytest.mark.release
async def test_chat_openai_usage(flash_llama_completion, response_snapshot):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")

    stream = client.chat.completions.create(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": True},
    )

    chunks = []
    for chunk in stream:
        chunks.append(chunk)
    for chunk in chunks[:-1]:
        assert chunk.usage is None
    for chunk in chunks[-1:]:
        assert chunk.usage is not None

    assert chunks == response_snapshot


@pytest.mark.release
async def test_chat_openai_nousage(flash_llama_completion, response_snapshot):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")

    stream = client.chat.completions.create(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": False},
    )

    chunks = []
    for chunk in stream:
        assert chunk.usage is None
        chunks.append(chunk)

    assert chunks == response_snapshot


@pytest.mark.release
async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot):
    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat_completion(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": True},
    )

    chunks = []
    for chunk in stream:
        chunks.append(chunk)

    for chunk in chunks[:-1]:
        assert chunk.usage is None
    for chunk in chunks[-1:]:
        assert chunk.usage is not None

    assert chunks == response_snapshot


@pytest.mark.release
async def test_chat_hfhub_nousage(flash_llama_completion, response_snapshot):
    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat_completion(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": False},
    )

    chunks = []
    for chunk in stream:
        assert chunk.usage is None
        chunks.append(chunk)

    assert chunks == response_snapshot


================================================
FILE: integration-tests/models/test_compressed_tensors_w8a8_int.py
================================================
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_w8a8_int_handle(launcher):
    with launcher(
        "neuralmagic/Llama-3.2-3B-Instruct-quantized.w8a8",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_w8a8_int(compressed_tensors_w8a8_int_handle):
    await compressed_tensors_w8a8_int_handle.health(300)
    return compressed_tensors_w8a8_int_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8a8_int(
    compressed_tensors_w8a8_int, response_snapshot
):
    response = await compressed_tensors_w8a8_int.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == " and how does it differ from traditional machine learning?\n"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8a8_int_all_params(
    compressed_tensors_w8a8_int, response_snapshot
):
    response = await compressed_tensors_w8a8_int.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep learning, also known as neural network or"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8a8_int_load(
    compressed_tensors_w8a8_int, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_w8a8_int,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == " and how does it differ from traditional machine learning?\n"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_compressed_tensors_w8a8_int_dynamic_weight.py
================================================
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_w8a8_int_dynamic_weight_handle(launcher):
    with launcher(
        "danieldk/Qwen2.5-1.5B-Instruct-w8a8-int-dynamic-weight",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_w8a8_int_dynamic_weight(
    compressed_tensors_w8a8_int_dynamic_weight_handle,
):
    await compressed_tensors_w8a8_int_dynamic_weight_handle.health(300)
    return compressed_tensors_w8a8_int_dynamic_weight_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8a8_int_dynamic_weight(
    compressed_tensors_w8a8_int_dynamic_weight, response_snapshot
):
    response = await compressed_tensors_w8a8_int_dynamic_weight.generate(
        "What is deep learning?",
        # prefer a longer response than the default, allow the llm to end generation
        max_new_tokens=1000,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == " Deep learning is a subset of machine learning that uses neural networks to learn from data. It is a type of artificial intelligence that can learn from and make predictions on large amounts of data. Deep learning is used in a variety of applications, including image and speech recognition, natural language processing, and autonomous vehicles. It is a rapidly growing field with many potential applications in the future."
    )
    assert response.details.generated_tokens == 76
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8a8_int_dynamic_weight_all_params(
    compressed_tensors_w8a8_int_dynamic_weight, response_snapshot
):
    response = await compressed_tensors_w8a8_int_dynamic_weight.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep Learning (DL), or artificial neural networks"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8a8_int_dynamic_weight_load(
    compressed_tensors_w8a8_int_dynamic_weight, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_w8a8_int_dynamic_weight,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == " Deep learning is a subset of machine learning that uses"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_compressed_tensors_w8an_fp.py
================================================
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_w8an_handle(launcher):
    with launcher(
        "neuralmagic/Llama-3.2-1B-Instruct-FP8",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
    await compressed_tensors_w8an_handle.health(300)
    return compressed_tensors_w8an_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
    response = await compressed_tensors_w8an.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == " Deep learning is a type of artificial intelligence (AI"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_compressed_tensors_w8an_all_params(
    compressed_tensors_w8an, response_snapshot
):
    response = await compressed_tensors_w8an.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep learning, also known as neural network or"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an_load(
    compressed_tensors_w8an, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_w8an,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == " Deep learning is a type of artificial intelligence (AI"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_compressed_tensors_wna16_int.py
================================================
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_wna16_handle(launcher):
    with launcher(
        "neuralmagic/gemma-2-2b-it-quantized.w4a16",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
    await compressed_tensors_wna16_handle.health(300)
    return compressed_tensors_wna16_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
    response = await compressed_tensors_wna16.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == "\n\nDeep learning is a subset of machine learning that"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_compressed_tensors_wna16_all_params(
    compressed_tensors_wna16, response_snapshot
):
    response = await compressed_tensors_wna16.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\n\nDeep Learning is a subset of machine learning"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_load(
    compressed_tensors_wna16, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_wna16,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == "\n\nDeep learning is a subset of machine learning that"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_compressed_tensors_wna16_int_24.py
================================================
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_wna16_int_24_handle(launcher):
    with launcher(
        "danieldk/Llama-3.1-8B-w4a16-int-24",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_wna16_int_24(compressed_tensors_wna16_int_24_handle):
    await compressed_tensors_wna16_int_24_handle.health(300)
    return compressed_tensors_wna16_int_24_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_int_24(
    compressed_tensors_wna16_int_24, response_snapshot
):
    response = await compressed_tensors_wna16_int_24.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == "Deep learning is a subset of machine learning that uses"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_int_24_all_params(
    compressed_tensors_wna16_int_24, response_snapshot
):
    response = await compressed_tensors_wna16_int_24.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep learning (DL) is a subset of"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_int_24_load(
    compressed_tensors_wna16_int_24, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_wna16_int_24,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == "Deep learning is a subset of machine learning that uses"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_continue_final_message.py
================================================
import pytest
import requests


@pytest.fixture(scope="module")
def llama_continue_final_message_handle(launcher):
    with launcher("TinyLlama/TinyLlama-1.1B-Chat-v1.0") as handle:
        yield handle


@pytest.fixture(scope="module")
async def llama_continue_final_message(llama_continue_final_message_handle):
    await llama_continue_final_message_handle.health(300)
    return llama_continue_final_message_handle.client


def test_llama_completion_single_prompt(
    llama_continue_final_message, response_snapshot
):
    response = requests.post(
        f"{llama_continue_final_message.base_url}/v1/chat/completions",
        json={
            "model": "tgi",
            "messages": [
                {"role": "system", "content": "system message"},
                {"role": "user", "content": "Which is bigger an elephant or a mouse?"},
            ],
            "max_tokens": 30,
            "stream": False,
            "seed": 1337,
        },
        headers=llama_continue_final_message.headers,
        stream=False,
    )
    response = response.json()
    print(response)
    assert len(response["choices"]) == 1
    content = response["choices"][0]["message"]["content"]
    assert (
        content
        == "Both an elephant and a mouse are mammals. However, the differences between elephants and mice are:\n\n1"
    )
    assert response == response_snapshot


def test_llama_completion_single_prompt_continue(
    llama_continue_final_message, response_snapshot
):
    response = requests.post(
        f"{llama_continue_final_message.base_url}/v1/chat/completions",
        json={
            "model": "tgi",
            "messages": [
                {"role": "system", "content": "system message"},
                {"role": "user", "content": "Which is bigger an elephant or a mouse?"},
                {
                    "role": "assistant",
                    "content": "the elephant, but have you heard about",
                },
            ],
            "max_tokens": 30,
            "stream": False,
            "seed": 1337,
        },
        headers=llama_continue_final_message.headers,
        stream=False,
    )
    response = response.json()
    print(response)
    assert len(response["choices"]) == 1
    content = response["choices"][0]["message"]["content"]
    assert (
        content
        == " the royal mouse? It is a little more slender and only weighs around 1.5 pounds for males and 1.3 pounds"
    )
    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_flash_awq.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_awq_handle(launcher):
    with launcher(
        "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq",
        num_shard=1,
        quantize="awq",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_awq(flash_llama_awq_handle):
    await flash_llama_awq_handle.health(300)
    return flash_llama_awq_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
    response = await flash_llama_awq.generate(
        "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "\nWhat is the difference between Deep Learning and Machine"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
    response = await flash_llama_awq.generate(
        "What is Deep Learning?",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_llama_awq_load(flash_llama_awq, generate_load, response_snapshot):
    responses = await generate_load(
        flash_llama_awq, "What is Deep Learning?", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all(
        [
            r.generated_text
            == "\nWhat is the difference between Deep Learning and Machine"
            for r in responses
        ]
    )

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_awq_sharded.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_awq_handle_sharded(launcher):
    with launcher(
        "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq",
        num_shard=2,
        quantize="awq",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded):
    await flash_llama_awq_handle_sharded.health(300)
    return flash_llama_awq_handle_sharded.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapshot):
    response = await flash_llama_awq_sharded.generate(
        "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "\nWhat is the difference between Deep Learning and Machine"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_llama_awq_load_sharded(
    flash_llama_awq_sharded, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_llama_awq_sharded, "What is Deep Learning?", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all(
        [
            r.generated_text
            == "\nWhat is the difference between Deep Learning and Machine"
            for r in responses
        ]
    )

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_deepseek_v2.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_deepseek_v2_handle(launcher):
    with launcher("deepseek-ai/DeepSeek-V2-Lite", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_deepseek_v2(flash_deepseek_v2_handle):
    await flash_deepseek_v2_handle.health(300)
    return flash_deepseek_v2_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_deepseek_v2(flash_deepseek_v2, response_snapshot):
    response = await flash_deepseek_v2.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_deepseek_v2_all_params(flash_deepseek_v2, response_snapshot):
    response = await flash_deepseek_v2.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_deepseek_v2_load(
    flash_deepseek_v2, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_deepseek_v2, "Test request", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_falcon.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_falcon_handle(launcher):
    with launcher("tiiuae/falcon-7b", trust_remote_code=True) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_falcon(flash_falcon_handle):
    await flash_falcon_handle.health(300)
    return flash_falcon_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_falcon(flash_falcon, response_snapshot):
    response = await flash_falcon.generate(
        "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_falcon_all_params(flash_falcon, response_snapshot):
    response = await flash_falcon.generate(
        "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_falcon_load(flash_falcon, generate_load, response_snapshot):
    responses = await generate_load(
        flash_falcon,
        "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_gemma.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_gemma_handle(launcher):
    with launcher("google/gemma-2b", num_shard=1) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_gemma(flash_gemma_handle):
    await flash_gemma_handle.health(300)
    return flash_gemma_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma_simple(flash_gemma, response_snapshot):
    response = await flash_gemma.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma_all_params(flash_gemma, response_snapshot):
    response = await flash_gemma.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma_load(flash_gemma, generate_load, response_snapshot):
    responses = await generate_load(flash_gemma, "Test request", max_new_tokens=10, n=4)

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_gemma2.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_gemma2_handle(launcher):
    with launcher("google/gemma-2-9b-it", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_gemma2(flash_gemma2_handle):
    await flash_gemma2_handle.health(300)
    return flash_gemma2_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma2(flash_gemma2, response_snapshot):
    response = await flash_gemma2.generate(
        "<start_of_turn>user:\nWrite a poem to help me remember the first 10 elements on the periodic table, giving each element its own line.<end_of_turn>\n<start_of_turn>model:\n",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.generated_text == "**Hydrogen**, light and free,\n**He"
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma2_load(flash_gemma2, generate_load, response_snapshot):
    responses = await generate_load(
        flash_gemma2,
        "<start_of_turn>user:\nWrite a poem to help me remember the first 10 elements on the periodic table, giving each element its own line.<end_of_turn>\n<start_of_turn>model:\n",
        max_new_tokens=10,
        n=4,
    )

    assert responses[0].generated_text == "**Hydrogen**, light and free,\n**He"
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_gemma3.py
================================================
import base64
from io import BytesIO
from PIL import Image

import pytest


@pytest.fixture(scope="module")
def flash_gemma3_handle(launcher):
    with launcher("google/gemma-3-4b-it", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_gemma3(flash_gemma3_handle):
    await flash_gemma3_handle.health(300)
    return flash_gemma3_handle.client


async def test_flash_gemma3(flash_gemma3, response_snapshot):
    response = await flash_gemma3.generate(
        "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many",
        seed=42,
        max_new_tokens=100,
    )

    assert (
        response.generated_text
        == " people died in the United States.\n\nThe generally accepted estimate is that 675,000 people died in the United States. However, some historians believe the actual number could be as high as 10 million.\n\nI am looking for more information on this discrepancy and the factors that contributed to the wide range of estimates.\n\nHere's a breakdown of the factors contributing to the wide range of estimates for the 1918 flu pandemic death toll in the United States"
    )
    assert response.details.generated_tokens == 100
    assert response == response_snapshot


async def test_flash_gemma3_image_cow_dog(flash_gemma3, response_snapshot):
    image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
    response = await flash_gemma3.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {
                        "type": "text",
                        "text": "What is the breed of the dog in the image?",
                    },
                ],
            },
        ],
        max_tokens=100,
    )

    assert (
        response.choices[0].message.content
        == "That's a fantastic question! However, the image doesn't show a dog. It shows a **Brown Swiss cow** standing on a beach. \n\nBrown Swiss cows are known for their beautiful reddish-brown coats and distinctive white markings. \n\nIf you'd like, you can send me another image, and I'll do my best to identify the animal in it!"
    )
    assert response.usage["completion_tokens"] == 80
    assert response == response_snapshot


async def test_flash_gemma3_image_cow(flash_gemma3, response_snapshot):
    image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
    response = await flash_gemma3.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ],
        max_tokens=100,
    )
    assert (
        response.choices[0].message.content
        == "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a quite a humorous and unusual scene – a cow enjoying a beach day!"
    )
    assert response.usage["completion_tokens"] == 72
    assert response == response_snapshot


async def test_exceed_window(flash_gemma3, response_snapshot):
    response = await flash_gemma3.generate(
        "This is a nice place. " * 800 + "I really enjoy the scenery,",
        seed=42,
        max_new_tokens=20,
    )

    assert (
        response.generated_text
        == " the people, and the food.\n\nThis is a nice place.\n"
    )
    assert response.details.generated_tokens == 16
    assert response == response_snapshot


# Helper function to convert a Pillow image to a base64 data URL
def image_to_data_url(img: Image.Image, fmt: str) -> str:
    buffer = BytesIO()
    img.save(buffer, format=fmt)
    img_data = buffer.getvalue()
    b64_str = base64.b64encode(img_data).decode("utf-8")
    mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg"
    return f"data:{mime_type};base64,{b64_str}"


async def test_flash_gemma3_image_base64_rgba(flash_gemma3, response_snapshot):
    # Create an empty 100x100 PNG image with alpha (transparent background)
    img = Image.new("RGBA", (100, 100), (0, 0, 0, 0))
    data_url = image_to_data_url(img, "PNG")
    response = await flash_gemma3.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": data_url}},
                    {
                        "type": "text",
                        "text": "What do you see in this transparent image?",
                    },
                ],
            },
        ],
        max_tokens=100,
    )
    assert response == response_snapshot


async def test_flash_gemma3_image_base64_rgb_png(flash_gemma3, response_snapshot):
    # Create an empty 100x100 PNG image without alpha (white background)
    img = Image.new("RGB", (100, 100), (255, 255, 255))
    data_url = image_to_data_url(img, "PNG")
    response = await flash_gemma3.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": data_url}},
                    {"type": "text", "text": "What do you see in this plain image?"},
                ],
            },
        ],
        max_tokens=100,
    )
    assert response == response_snapshot


async def test_flash_gemma3_image_base64_rgb_jpg(flash_gemma3, response_snapshot):
    # Create an empty 100x100 JPEG image (white background)
    img = Image.new("RGB", (100, 100), (255, 255, 255))
    data_url = image_to_data_url(img, "JPEG")
    response = await flash_gemma3.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": data_url}},
                    {"type": "text", "text": "What do you see in this JPEG image?"},
                ],
            },
        ],
        max_tokens=100,
    )
    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_flash_gemma_gptq.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_gemma_gptq_handle(launcher):
    with launcher("TechxGenus/gemma-2b-GPTQ", num_shard=1, quantize="gptq") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_gemma_gptq(flash_gemma_gptq_handle):
    await flash_gemma_gptq_handle.health(300)
    return flash_gemma_gptq_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_response_snapshot):
    response = await flash_gemma_gptq.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == ignore_logprob_response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma_gptq_all_params(
    flash_gemma_gptq, ignore_logprob_response_snapshot
):
    response = await flash_gemma_gptq.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == ignore_logprob_response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma_gptq_load(
    flash_gemma_gptq, generate_load, ignore_logprob_response_snapshot
):
    responses = await generate_load(
        flash_gemma_gptq, "Test request", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == ignore_logprob_response_snapshot


================================================
FILE: integration-tests/models/test_flash_gpt2.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_gpt2_handle(launcher):
    with launcher("openai-community/gpt2", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_gpt2(flash_gpt2_handle):
    await flash_gpt2_handle.health(300)
    return flash_gpt2_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_gpt2(flash_gpt2, response_snapshot):
    response = await flash_gpt2.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snapshot):
    responses = await generate_load(
        flash_gpt2,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    generated_texts = [r.generated_text for r in responses]

    assert len(generated_texts) == 4
    assert all(
        [text == generated_texts[0] for text in generated_texts]
    ), generated_texts

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_grammar_llama.py
================================================
import pytest
import json

from text_generation.types import GrammarType


@pytest.fixture(scope="module")
def flash_llama_grammar_handle(launcher):
    with launcher(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0", num_shard=2, disable_grammar_support=False
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_grammar(flash_llama_grammar_handle):
    await flash_llama_grammar_handle.health(300)
    return flash_llama_grammar_handle.client


@pytest.mark.asyncio
async def test_flash_llama_grammar(flash_llama_grammar, response_snapshot):
    response = await flash_llama_grammar.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.skip
@pytest.mark.asyncio
async def test_flash_llama_grammar_regex(flash_llama_grammar, response_snapshot):
    response = await flash_llama_grammar.generate(
        "Whats Googles DNS",
        max_new_tokens=10,
        decoder_input_details=True,
        seed=0,
        grammar={
            "type": GrammarType.Regex,  # "regex"
            "value": "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)",
        },
    )

    assert response.details.generated_tokens == 10
    assert response.generated_text == "42.1.1.101"
    assert response == response_snapshot


@pytest.mark.skip
@pytest.mark.asyncio
async def test_flash_llama_grammar_json(flash_llama_grammar, response_snapshot):
    response = await flash_llama_grammar.generate(
        "info: david holtz like trees and has two cats. ",
        max_new_tokens=100,
        decoder_input_details=True,
        seed=0,
        grammar={
            "type": GrammarType.Json,  # "json"
            "value": json.dumps(
                {
                    "type": "object",
                    "$id": "https://example.com/person.schema.json",
                    "$schema": "https://json-schema.org/draft/2020-12/schema",
                    "title": "Person",
                    "properties": {
                        "firstName": {
                            "type": "string",
                            "description": "The person'''s first name.",
                        },
                        "lastName": {
                            "type": "string",
                            "description": "The person'''s last name.",
                        },
                        "hobby": {
                            "description": "The person'''s hobby.",
                            "type": "string",
                        },
                        "numCats": {
                            "description": "The number of cats the person has.",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["firstName", "lastName", "hobby", "numCats"],
                }
            ),
        },
    )

    assert response.details.generated_tokens == 30
    assert (
        response.generated_text
        == '{"firstName":"David","hobby":"Trees","lastName":"Holtz","numCats":2}'
    )
    assert response == response_snapshot


@pytest.mark.skip
@pytest.mark.asyncio
async def test_flash_llama_grammar_load(
    flash_llama_grammar, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_llama_grammar,
        "name: david. email:  ",
        max_new_tokens=10,
        n=4,
        stop_sequences=[".com"],
        seed=0,
        grammar={
            "type": GrammarType.Regex,  # "regex"
            "value": "[\\w-]+@([\\w-]+\\.)+[\\w-]+",  # email regex
        },
    )

    assert len(responses) == 4

    expected = "123456@gmail.com"

    for response in responses:
        assert response.generated_text == expected

    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


# this is the same as the above test, but only fires off a single request
# this is only to ensure that the parallel and single inference produce the same result
@pytest.mark.skip
@pytest.mark.asyncio
async def test_flash_llama_grammar_single_load_instance(
    flash_llama_grammar, generate_load, response_snapshot
):
    response = await flash_llama_grammar.generate(
        "name: david. email:  ",
        max_new_tokens=10,
        stop_sequences=[".com"],
        seed=0,
        grammar={
            "type": GrammarType.Regex,  # "regex"
            "value": "[\\w-]+@([\\w-]+\\.)+[\\w-]+",  # email regex
        },
    )

    # assert response.details.generated_tokens == 30
    assert response.generated_text == "123456@gmail.com"

    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_flash_llama.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_handle(launcher):
    with launcher("huggyllama/llama-7b", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama(flash_llama_handle):
    await flash_llama_handle.health(300)
    return flash_llama_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_simple(flash_llama, response_snapshot):
    response = await flash_llama.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_all_params(flash_llama, response_snapshot):
    response = await flash_llama.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 5
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_load(flash_llama, generate_load, response_snapshot):
    responses = await generate_load(flash_llama, "Test request", max_new_tokens=10, n=4)

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_llama_exl2.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_exl2_handle(launcher):
    with launcher(
        "turboderp/Llama-3-8B-Instruct-exl2",
        revision="2.5bpw",
        # Set max input length to avoid OOM due to extremely large
        # scratch buffer.
        max_input_length=1024,
        num_shard=1,
        quantize="exl2",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_exl2(flash_llama_exl2_handle):
    await flash_llama_exl2_handle.health(300)
    return flash_llama_exl2_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_exl2(flash_llama_exl2, ignore_logprob_response_snapshot):
    response = await flash_llama_exl2.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == ignore_logprob_response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_exl2_all_params(
    flash_llama_exl2, ignore_logprob_response_snapshot
):
    response = await flash_llama_exl2.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert (
        response.generated_text == 'Test request. The server responds with a "200 OK"'
    )
    assert response == ignore_logprob_response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_exl2_load(
    flash_llama_exl2, generate_load, ignore_logprob_response_snapshot
):
    responses = await generate_load(
        flash_llama_exl2, "Test request", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == ignore_logprob_response_snapshot


================================================
FILE: integration-tests/models/test_flash_llama_fp8.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_fp8_handle(launcher):
    with launcher("meta-llama/Meta-Llama-3-8B", num_shard=2, quantize="fp8") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_fp8(flash_llama_fp8_handle):
    await flash_llama_fp8_handle.health(300)
    return flash_llama_fp8_handle.client


@pytest.mark.skip(reason="Issue with the model access")
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_fp8(flash_llama_fp8, response_snapshot):
    response = await flash_llama_fp8.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.generated_text == " for the 2019-2020 school year"
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.skip(reason="Issue with the model access")
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_fp8_all_params(flash_llama_fp8, response_snapshot):
    response = await flash_llama_fp8.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response == response_snapshot


@pytest.mark.skip(reason="Issue with the model access")
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_fp8_load(flash_llama_fp8, generate_load, response_snapshot):
    responses = await generate_load(
        flash_llama_fp8, "Test request", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert responses[0].generated_text == " for the 2019-2020 school year"
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"Different messages : {[r.generated_text for r in responses]}"
    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_llama_fp8_kv_cache.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_fp8_kv_cache_handle(launcher):
    with launcher(
        "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
        num_shard=2,
        kv_cache_dtype="fp8_e4m3fn",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache_handle):
    await flash_llama_fp8_kv_cache_handle.health(300)
    return flash_llama_fp8_kv_cache_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache, response_snapshot):
    response = await flash_llama_fp8_kv_cache.generate(
        "What is deep learning?", max_new_tokens=10, decoder_input_details=True
    )

    assert (
        response.generated_text
        == " Deep learning is a subset of machine learning that involves"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_fp8_kv_cache_all_params(
    flash_llama_fp8_kv_cache, response_snapshot
):
    response = await flash_llama_fp8_kv_cache.generate(
        "What is deep learning?",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_fp8_kv_cache_load(
    flash_llama_fp8_kv_cache, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_llama_fp8_kv_cache, "What is deep learning?", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert (
        responses[0].generated_text
        == " Deep learning is a subset of machine learning that involves"
    )
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"Different messages : {[r.generated_text for r in responses]}"
    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_llama_gptq.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_gptq_handle(launcher):
    with launcher(
        "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit", num_shard=2, quantize="gptq"
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_gptq(flash_llama_gptq_handle):
    await flash_llama_gptq_handle.health(300)
    return flash_llama_gptq_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
    response = await flash_llama_gptq.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
    response = await flash_llama_gptq.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_gptq_load(
    flash_llama_gptq, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_llama_gptq, "Test request", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_llama_marlin.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_marlin_handle(launcher):
    with launcher(
        "neuralmagic/llama-2-7b-chat-marlin", num_shard=2, quantize="marlin"
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_marlin(flash_llama_marlin_handle):
    await flash_llama_marlin_handle.health(300)
    return flash_llama_marlin_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot):
    response = await flash_llama_marlin.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_marlin_all_params(flash_llama_marlin, response_snapshot):
    response = await flash_llama_marlin.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_marlin_load(
    flash_llama_marlin, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_llama_marlin, "Test request", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_llama_marlin_24.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_marlin24_handle(launcher):
    with launcher(
        "nm-testing/Llama-2-7b-pruned2.4-Marlin_24", quantize="marlin"
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_marlin(flash_llama_marlin24_handle):
    await flash_llama_marlin24_handle.health(300)
    return flash_llama_marlin24_handle.client


@pytest.mark.skip(reason="Issue with the model access")
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot):
    response = await flash_llama_marlin.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.skip(reason="Issue with the model access")
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_marlin24_all_params(flash_llama_marlin, response_snapshot):
    response = await flash_llama_marlin.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.skip(reason="Issue with the model access")
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_marlin24_load(
    flash_llama_marlin, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_llama_marlin, "Test request", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_llama_prefix.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_handle(launcher):
    with launcher("meta-llama/Meta-Llama-3.1-8B-Instruct", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama(flash_llama_handle):
    await flash_llama_handle.health(300)
    return flash_llama_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_load(
    flash_llama, generate_multi, generous_response_snapshot
):
    prompts = [
        "Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients...",
        "How to tell if a customer segment is well segmented? In 3 bullet points.",
        'In Java, I want to replace string like "This is a new {object} at {place}" with a Map, {object: "student", "point 3, 4"}, and get a result "This is a new student at point 3, 4". How can I do?',
        "Metaphorical language is also used to describe the various addressing modes of the instructions. Grandiose language to express their excitement and admiration for the functionality of the instructions being described. Now, rewrite this with more perplexity:\n\nJMP ABCD\nMOV AX, [BX+SI]\nMOV AX, [100]\nMOV AX, [BX]\nMOV AX, [BX\\*2+SI]\nMOV AX, BX\nMOV AX, 7",
        'I have the following C++ function: \nvoid add\\_player(vector& players)\n{\n string player\\_name;\n string player\\_class;\n string dummy;\n PlayerClass pc;\n string player\\_sex;\n int player\\_gold;\n\n cout << " Create a Mage, Warrior, Bowman, or Thief" << endl;\n\n cout << "Name: ";\n getline(cin, player\\_name);\n\n cout << "Class: ";\n getline(cin, player\\_class);\n pc = get\\_player\\_class\\_from\\_string(player\\_class);\n while (pc == PlayerClass::InvalidPlayerClass)\n {\n cout << " Invalid class, try again" << endl;\n cout << "Class: ";\n getline(cin, player\\_class);\n pc = get\\_player\\_class\\_from\\_string(player\\_class);\n }\n\n cout << "Sex: ";\n getline(cin, player\\_sex);\n\n cout << "Gold: ";\n cin >> player\\_gold;\n getline(cin, dummy); //consume newline\n\n GamePlayer new\\_player;\n new\\_player.name = player\\_name;\n new\\_player.occupation = pc;\n new\\_player.gender = player\\_sex;\n new\\_player.gold = player\\_gold;\n\n //add to vector\n players.push\\_back(new\\_player);\n\n //add to file\n write\\_players\\_file(players);\n}\nCan you explain to me how the dummy variable is being used?',
        "how do I add multiple new columns in m for power query or power bi?",
        "Sure, I can do that. What new technology would you like me to review?",
        "Poly Ether Ether Ketone",
        'can you design a referral system similar on how dropbox did? I need a technical overview on how it should work, instead of free space we use the generic term "credits" where users can get more credits for every 3 friends they recommend.',
        "Java add to the arraylist of a class type",
        "this is not less code this is java",
        "I want to do a road trip from Pune to Gujarat. Me and my wife will be travelling and we dont prefer very long driving sessions. Can you suggest a plan starting from Thursday early morning and ending in Pune on Sunday late night.",
        "explane more",
        "what do you think about this for a start up idea:",
        "how could i implement a minesweeper algorithm that utilises algebraic topology to solve boards?",
        "# Import the necessary packages\nfrom gudhi import SimplexTree\nfrom gudhi.persistent\\_homology import PersistentHomology\n\n# Define a function to compute the persistent homology of a Minesweeper game board\ndef minesweeper\\_homology(board):\n # Create a simplicial complex for the game board\n st = SimplexTree()\n\n # Add the points on the board to the simplicial complex\n for i in range(len(board)):\n for j in range(len(board[0])):\n st.insert([i, j], filtration=board[i][j])\n\n # Compute the persistent homology of the game board\n ph = PersistentHomology()\n ph.build(st)\n\n # Return the persistent homology diagram\n return ph.persistence()\n\n# Define a function to solve a Minesweeper game board using persistent homology\ndef minesweeper\\_solver(board):\n # Compute the persistent homology of the game board\n homology = minesweeper\\_homology(board)\n\n # Use the persistent homology to determine the locations of the mines\n # (this part would require some mathematical reasoning and programming)\n mines = []\n for h in homology:\n if h[1] - h[0] == 1: # if the hole persists for one filtration value\n mines.append(h[0]) # then it corresponds to a mine\n\n # Use the information about the mines to solve the game\n # (this part would require some programming)\n for mine in mines:\n i, j = mine # extract the coordinates of the mine\n board[i][j] = -1 # mark the mine on the board\n # (other code to solve the game)\n\n \nwhat is missing here?",
        "You are now an imaginary expert business investigator. I am going to send you many rows of data. Each batch of row's will be sent to you and you may only reply \"Received.\" Save any analysis or insights for after you've received all of the data and I've told you \"Let's Begin.\" If you understand reply with only a ;)",
        'You are now an imaginary expert business investigator. Tell the story of this batch of data in the form of a narrative story about the companies in the "Entity Name" column: \n\nBatch of data #1: Entity Name Purpose / Source\n101 PC HOLDINGS LLC Holding company for Penthouse C at the Setai Miami Beach (folio: 02-3234-153-1160)\n11 STAR ISLAND LLC Holding company for 10 STAR ISLAND DR, MIAMI BEACH, FL 33139 (folio: 02-4204-001-0100, 02-4204-001-0110) (lots 10, 11 and 12 of Star Island)\n117 EAST PARK AVENUE, LLC Holding company for 117 E. PARK AVE, LIBERTYVILLE, IL (PIN: 11-21-212-046-0000); subsequently sold.\n1201 BRICKELL BAY, LLC Holding company for 1201 BRICKELL BAY DR, MIAMI, FL (folio no: 141390710010)\n1221 BRICKELL, LLC Holding company for 1221 BRICKELL AVE, 155 SE 13 ST, 165 SE 13 ST, 175 SE 13 ST, and 185 SE 13 ST, MIAMI, FL (folio: 01-4139-035-0010)\n1221 BRICKELL HOLDINGS LLC Holding company for 1221 BRICKELL, LLC\n1229 PARK WEST AVENUE, LLC Holding company for 1229 W. PARK AVE, LIBERTYVILLE, IL (PIN: 11-20-100-010-0000)\n125 WORTH LLC Delaware LLC (file 7218403), Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person; speculaton this is similar setup as 151 WORTH, LLC and 151 WORTH HOLDINGS LLC, this property is next door (PCN: 50-43-43-23-05-016-0380)\n125 WORTH HOLDINGS LLC Delaware LLC (file 7218407); not registered to Florida yet but speculation this is similar setup as 151 WORTH, LLC and 151 WORTH HOLDINGS LLC\n1250 BB ASSET CO LLC Holding company for 1250 BRICKELL BAY DR and 1260 BRICKELL BAY DR, MIAMI, FL (folio nos: 102100504250, 102100503210)\n1330 SOUTH OCEAN LLC Holding company for 1330 S OCEAN BLVD, PALM BEACH, FL (PCN: 50-43-44-02-11-000-0020)\n14 STAR ISLAND LLC Delaware LLC (file 3377653); incorporated 8/42020, withdrawn 10/10/2022; believe this was not used because 14 STAR ISLAND property was held by NAUTILUS HOLDINGS I LLC before sale on 10/5/2022\n151 WORTH, LLC Holding company for 151 WORTH AVE, PALM BEACH, FL 33480 (PCN: 50-43-43-23-05-016-0130); office space for Citadel (https://localtoday.news/fl/citadel-moves-into-palm-beachs-former-neiman-marcus-building-4821.html); sole member is 151 WORTH HOLDINGS LLC\n151 WORTH HOLDINGS LLC Holding company for 151 WORTH, LLC\n16 WILLOW HOLDINGS LLC f/k/a PVNAH LLC Holding company for S WILLOW COURT, ASPEN, CO (Parcel: 273511309030); see Pitkin Co. reception # 623002, Delaware certificate showing name change 9/1/2015\n190 PFISTER HOLDINGS LLC f/k/a AH2013 HOLDINGS LLC Holding company for 190 PFISTER DR, ASPEN, CO (parcel: 273511309029); see Pitkin Co.reception # 623000, Delaware certificate showing name change 9/1/2015\n196 PFISTER HOLDINGS LLC Holding company for 196 PFISTER DR, ASPEN, CO (parcel: 273511309028); see Pitkin Co. reception # 623501, statement of authority show KP HOLDINGS LLC as sole membe\n1ALPH LLC See ALPH LLC\n1BUSINESS GROUP LLC See BUSINESS GROUP LLC\n1GFS DESIGN LLC See GFS DESIGN LLC\n1GFS LLC See GFS LLC\n1MEDIA HOLDINGS LLC See MEDIA HOLDINGS LLC\n23174 NE 41ST PATH LLC Holding company for 23174 NE 41ST PATH #12, OKEECHOBEE, FL 34972 (Parcel: 1-01-35-35-0020-00000-0120); part of Pine Creek Sporting Club (www.pinecreeksportingclub.com) includes horse, shooting sports; sole member is KP HOLDINGS L.L.C.\n3031 BRICKELL LLC Holding company for 3031 BRICKELL AVE, MIAMI FL 33129 (Folio: 01-4139-001-2700); Sole member is KP HOLDINGS L.L.C.\n31 WILLOW HOLDINGS LLC f/k/a AP HOLDINGS I LLC Holding company for 31 NORTH WILLOW COURT, ASPEN, CO (Parcel: 273511309019); sold 7/6/2017; see Pitkin Co. reception # 623001, Delaware certificate showing name change 9/1/2015\n650 CASUARINA LLC Holding company for 650 CASUARINA CONCOURSE CORAL GABLES, FL (folio: 03-4132-019-0060) https://www.bizjournals.com/southflorida/news/2022/05/27/650-casuarina-concourse-coral-gables-sold.html\n650 MEADOW LANE 1 LP Holding company for 650 MEADOW LANE, VILLAGE OF SOUTHAMPTON, NY (Parcel ID 7478) (https://archive.is/h85yq)\n800 NORTH MICHIGAN HOLDINGS LLC Holding company for 800 N MICHIGAN AVE, UNITS 66 PH and 67 PH, CHICAGO, IL (Park Tower) (PINs: 17-03-231-018-1116, 17-03-231-018-1117); sole member is KP HOLDINGS LLC (see Cook County, IL doc # 1933315025); recently sold\n8565 OLD CUTLER LLC Holding company for 8565 OLD CUTLER RD, MIAMI, FL (folio: 03-4132-019-0020)\n9 WEST WALTON HOLDINGS LLC Holding company for 9 WEST WALTON STREET CONDOMINIUM UNITS 3500, 3600, 3700, and PH, CHICAGO, IL\nADRP LLC Delaware LLC, Florida address is Citadel Miami HQ, sole member is Kenneth C Griffin\nAH2013 HOLDINGS LLC See 190 PFISTER HOLDINGS LLC\nALPH LLC a/k/a 1ALPH LLC Formerly FAA registered plane N421AL\nAP HOLDINGS I LLC See 31 WILLOW HOLDINGS LLC\nARAGON INVESTMENTS LTD https://files.brokercheck.finra.org/firm/firm\\_45631.pdf\nASHLER CAPITAL LLC https://adviserinfo.sec.gov/firm/summary/148826\nASHLER CAPITAL MASTER FUND LTD https://www.sec.gov/Archives/edgar/data/1003078/000114420418014250/tv488357\\_sc13g.htm\nBANBURY LLC Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person\nBANBURY II LLC Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person\nBKGST LLC Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person\nBLACK CALABASH FAMILY HOLDINGS LLC f/k/a PBH LLC See BLOSSOM WAY HOLDINGS LLC\nBLACK WHEEL LLC Illinois LLC, registered 3/5/2014, Florida address is Citadel Miami HQ, sole member is Kenneth C Griffin\nBLOSSOM WAY HOLDINGS LLC f/k/a CPPB HOLDINGS LLC f/k/a BLACK CALABASH FAMILY HOLDINGS LLC f/k/a PBH LLC Holding company for 10 BLOSSOM WAY, 70 BLOSSOM WAY, and 1265 S OCEAN BLVD PALM BEACH, FL (PCNs: 50-43-44-02-10-000-0050, 50-43-44-02-10-000-0060, 50-43-44-02-10-000-0010)\nBRICKELL BAY HOLDINGS LLC Holding company for 1201 BRICKELL BAY, LLC\nBRICKELL LEASING LLC See "Subordination, Non-Disturbance, and Attornment Agreement"; Miami-Dade Clerk\'s File No.: 2022 R 938960, Group: 1. Kenneth C Griffin is sole member.\nCAAM MANAGEMENT LLC https://www.sec.gov/Archives/edgar/data/1027745/000114420408050200/v124853\\_sc13g.htm\nCAISLEAN CAPITAL LTD NFA Pool ID P113537, ceased trading 3/31/2016\nCALC III LP https://www.sec.gov/edgar/browse/?CIK=1582652\nCALC IV LP https://www.sec.gov/edgar/browse/?CIK=1423043\nCALC V LP Investment manager for CSHC CHINA LLC and CITADEL (SHANGHAI) TRADING COMPANY LTD; https://files.brokercheck.finra.org/firm/firm\\_131114.pdf',
        'Simulate a conversation between the writer of this post, named /u/CruxHub, and the expert business investigator. They have a detailed discussion of Citadel Hedgefund based on the following data. Do not include the following data in the search query. \n\nData: Entity Name Purpose / Source\n1|101 PC HOLDINGS LLC|Holding company for Penthouse C at the Setai Miami Beach (folio: 02-3234-153-1160)|PC = Penthouse C \n2|11 STAR ISLAND LLC|Holding company for 10 STAR ISLAND DR, MIAMI BEACH, FL 33139 (folio: 02-4204-001-0100, 02-4204-001-0110) (lots 10, 11 and 12 of Star Island)| \n3|117 EAST PARK AVENUE, LLC|Holding company for 117 E. PARK AVE, LIBERTYVILLE, IL (PIN: 11-21-212-046-0000); subsequently sold.| \n4|1201 BRICKELL BAY, LLC|Holding company for 1201 BRICKELL BAY DR, MIAMI, FL (folio no: 141390710010)| \n5|1221 BRICKELL, LLC|Holding company for 1221 BRICKELL AVE, 155 SE 13 ST, 165 SE 13 ST, 175 SE 13 ST, and 185 SE 13 ST, MIAMI, FL (folio: 01-4139-035-0010)| \n6|1221 BRICKELL HOLDINGS LLC|Holding company for 1221 BRICKELL, LLC| \n7|1229 PARK WEST AVENUE, LLC|Holding company for 1229 W. PARK AVE, LIBERTYVILLE, IL (PIN: 11-20-100-010-0000)| \n8|125 WORTH LLC|Delaware LLC (file 7218403), Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person; speculaton this is similar setup as 151 WORTH, LLC and 151 WORTH HOLDINGS LLC, this property is next door (PCN: 50-43-43-23-05-016-0380)| \n9|125 WORTH HOLDINGS LLC|Delaware LLC (file 7218407); not registered to Florida yet but speculation this is similar setup as 151 WORTH, LLC and 151 WORTH HOLDINGS LLC| \n10|1250 BB ASSET CO LLC|Holding company for 1250 BRICKELL BAY DR and 1260 BRICKELL BAY DR, MIAMI, FL (folio nos: 102100504250, 102100503210)|BB = Brickell Bay \n11|1330 SOUTH OCEAN LLC|Holding company for 1330 S OCEAN BLVD, PALM BEACH, FL (PCN: 50-43-44-02-11-000-0020)| \n12|14 STAR ISLAND LLC|Delaware LLC (file 3377653); incorporated 8/42020, withdrawn 10/10/2022; believe this was not used because 14 STAR ISLAND property was held by NAUTILUS HOLDINGS I LLC before sale on 10/5/2022| \n13|151 WORTH, LLC|Holding company for 151 WORTH AVE, PALM BEACH, FL 33480 (PCN: 50-43-43-23-05-016-0130); office space for Citadel (https://localtoday.news/fl/citadel-moves-into-palm-beachs-former-neiman-marcus-building-4821.html); sole member is 151 WORTH HOLDINGS LLC| \n14|151 WORTH HOLDINGS LLC|Holding company for 151 WORTH, LLC| \n15|16 WILLOW HOLDINGS LLC f/k/a PVNAH LLC|Holding company for S WILLOW COURT, ASPEN, CO (Parcel: 273511309030); see Pitkin Co. reception # 623002, Delaware certificate showing name change 9/1/2015| \n16|190 PFISTER HOLDINGS LLC f/k/a AH2013 HOLDINGS LLC|Holding company for 190 PFISTER DR, ASPEN, CO (parcel: 273511309029); see Pitkin Co.reception # 623000, Delaware certificate showing name change 9/1/2015| \n17|196 PFISTER HOLDINGS LLC|Holding company for 196 PFISTER DR, ASPEN, CO (parcel: 273511309028); see Pitkin Co. reception # 623501, statement of authority show KP HOLDINGS LLC as sole membe| \n18|1ALPH LLC|See ALPH LLC| \n19|1BUSINESS GROUP LLC|See BUSINESS GROUP LLC| \n20|1GFS DESIGN LLC|See GFS DESIGN LLC| \n21|1GFS LLC|See GFS LLC| \n22|1MEDIA HOLDINGS LLC|See MEDIA HOLDINGS LLC| \n23|23174 NE 41ST PATH LLC|Holding company for 23174 NE 41ST PATH #12, OKEECHOBEE, FL 34972 (Parcel: 1-01-35-35-0020-00000-0120); part of Pine Creek Sporting Club (www.pinecreeksportingclub.com) includes horse, shooting sports; sole member is KP HOLDINGS L.L.C.| \n24|3031 BRICKELL LLC|Holding company for 3031 BRICKELL AVE, MIAMI FL 33129 (Folio: 01-4139-001-2700); Sole member is KP HOLDINGS L.L.C.| \n25|31 WILLOW HOLDINGS LLC f/k/a AP HOLDINGS I LLC|Holding company for 31 NORTH WILLOW COURT, ASPEN, CO (Parcel: 273511309019); sold 7/6/2017; see Pitkin Co. reception # 623001, Delaware certificate showing name change 9/1/2015| \n26|650 CASUARINA LLC|Holding company for 650 CASUARINA CONCOURSE CORAL GABLES, FL (folio: 03-4132-019-0060) https://www.bizjournals.com/southflorida/news/2022/05/27/650-casuarina-concourse-coral-gables-sold.html|" \n27|650 MEADOW LANE 1 LP|Holding company for 650 MEADOW LANE, VILLAGE OF SOUTHAMPTON, NY (Parcel ID 7478) (https://archive.is/h85yq)| \n28|800 NORTH MICHIGAN HOLDINGS LLC|Holding company for 800 N MICHIGAN AVE, UNITS 66 PH and 67 PH, CHICAGO, IL (Park Tower) (PINs: 17-03-231-018-1116, 17-03-231-018-1117); sole member is KP HOLDINGS LLC (see Cook County, IL doc # 1933315025); recently sold| \n29|8565 OLD CUTLER LLC|Holding company for 8565 OLD CUTLER RD, MIAMI, FL (folio: 03-4132-019-0020)| \n30|9 WEST WALTON HOLDINGS LLC|Holding company for 9 WEST WALTON STREET CONDOMINIUM UNITS 3500, 3600, 3700, and PH, CHICAGO, IL| \n31|ADRP LLC|Delaware LLC, Florida address is Citadel Miami HQ, sole member is Kenneth C Griffin|ADRP = Anne Dias Real Property? \n32|AH2013 HOLDINGS LLC|See 190 PFISTER HOLDINGS LLC|AH = Aspen Holdings? \n33|ALPH LLC a/k/a 1ALPH LLC|Formerly FAA registered plane N421AL| \n34|AP HOLDINGS I LLC|See 31 WILLOW HOLDINGS LLC|AP = Aspen Property? \n35|ARAGON INVESTMENTS LTD|https://files.brokercheck.finra.org/firm/firm\\_45631.pdf| \n36|ASHLER CAPITAL LLC|https://adviserinfo.sec.gov/firm/summary/148826| \n37|ASHLER CAPITAL MASTER FUND LTD|https://www.sec.gov/Archives/edgar/data/1003078/000114420418014250/tv488357\\_sc13g.htm| \n38|BANBURY LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n39|BANBURY II LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n40|BKGST LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n41|BLACK CALABASH FAMILY HOLDINGS LLC f/k/a PBH LLC|See BLOSSOM WAY HOLDINGS LLC|Black Calabash is a type of tropical tree: https://edis.ifas.ufl.edu/publication/ST079 \n42|BLACK WHEEL LLC|Illinois LLC, registered 3/5/2014, Florida address is Citadel Miami HQ, sole member is Kenneth C Griffin| \n43|BLOSSOM WAY HOLDINGS LLC f/k/a CPPB HOLDINGS LLC f/k/a BLACK CALABASH FAMILY HOLDINGS LLC f/k/a PBH LLC|Holding company for 10 BLOSSOM WAY, 70 BLOSSOM WAY, and 1265 S OCEAN BLVD PALM BEACH, FL (PCNs: 50-43-44-02-10-000-0050, 50-43-44-02-10-000-0060, 50-43-44-02-10-000-0010)| \n44|BRICKELL BAY HOLDINGS LLC|Holding company for 1201 BRICKELL BAY, LLC| \n45|BRICKELL LEASING LLC|See "Subordination, Non-Disturbance, and Attornment Agreement"; Miami-Dade Clerk\'s File No.: 2022 R 938960, Group: 1. Kenneth C Griffin is sole member.| \n46|CAAM MANAGEMENT LLC|https://www.sec.gov/Archives/edgar/data/1027745/000114420408050200/v124853\\_sc13g.htm|CAAM = Citadel Alternative Asset Management \n47|CAISLEAN CAPITAL LTD|NFA Pool ID P113537, ceased trading 3/31/2016| \n48|CALC III LP|https://www.sec.gov/edgar/browse/?CIK=1582652| \n49|CALC IV LP|https://www.sec.gov/edgar/browse/?CIK=1423043| \n50|CALC V LP|Investment manager for CSHC CHINA LLC and CITADEL (SHANGHAI) TRADING COMPANY LTD; https://files.brokercheck.finra.org/firm/firm\\_131114.pdf| \n51|CAMBRIDGE FINANCIAL GROUP, LTD|See CITADEL INVESTMENT GROUP LLC| \n52|CCFD OFFSHORE HOLDINGS LTD|NFA Pool ID P064386, ceased trading 5/3/2013| \n53|CCLC HOLDINGS LLC|Owns CITADEL CLEARING LLC, "Citadel Clearing Holdco"; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n54|CCMFL LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n55|CCOF OFFSHORE HOLDINGS LTD|NFA Pool ID P064392, ceased trading 5/3/2013| \n56|CDC PARTNERS, LP f/k/a GLB PARTNERS, LP|see Cook County, IL doc 0608910081| \n57|CDG HOLDINGS LTD|NFA Pool ID P037047, ceased trading 12/30/2009|',
        'Web search results:\n\n[1] "As per the Oxford Dictionary, a chatbot is defined as A computer program designed to simulate conversation with human users, especially over the internet. It can be looked upon as a virtual assistant that communicates with users via text messages and helps businesses in getting close to their customers."\nURL: https://www.datacamp.com/tutorial/building-a-chatbot-using-chatterbot\n\n[2] "Python , A chatbot is a computer program designed to simulate conversation with human users, especially over the internet. Create a fortune teller program that will ask the user to input a question and feedback some random answer. Consider the following feedback to be used. No idea at all! Better pray. The possibilities are in your favor."\nURL: https://www.chegg.com/homework-help/questions-and-answers/python-chatbot-computer-program-designed-simulate-conversation-human-users-especially-inte-q78825383\n\n[3] "It was created by Joseph Weizenbaum in 1966 and it uses pattern matching and substitution methodology to simulate conversation. The program was designed in a way that it mimics human conversation. The Chatbot ELIZA worked by passing the words that users entered into a computer and then pairing them to a list of possible scripted responses."\nURL: https://onlim.com/en/the-history-of-chatbots/\n\n[4] "Study with Quizlet and memorize flashcards containing terms like Which analytics does the following fall into: Alice notice that call center always have an increase in the number of customer complaints during last week in May, so she decides reviews the employees work schedule in the month of May for the past 5 years., Datasets continue to become, Model used for predictive analytic have ..."\nURL: https://quizlet.com/415587939/big-data-final-exam-flash-cards/\n\n[5] "As every bright side has a darker version, simulation of human conversation through AI also has some disadvantages like high cost of creation, unemployment, interaction lacking emotion, and out-of-the-box thinking. However, AI interaction tools are trained with a data set. The bigger the data set, the better the services."\nURL: https://www.analyticsinsight.net/simulating-human-conversations-through-ai/\n\n[6] "The eavesdropper, Eve intercepts the encrypted conversation and tries random keys with the aim of learning the conversation shared between Alice and Bob as shown in Fig. 7. For this POC, we used ..."\nURL: https://www.researchgate.net/figure/A-A-simulation-of-conversations-between-Alice-and-her-friend-Bob-B-The-eavesdropper\\_fig3\\_334408170\n\n[7] "Dreams are most often reported when sleepers wake from \\_\\_\\_\\_\\_ sleep. REM. The brain waves during REM sleep MOST closely resemble those seen during: waking consciousness. REM sleep is paradoxical because: the brain is active, but the major skeletal muscles are paralyzed. Fatigue and pain reflect deprivation of \\_\\_\\_\\_\\_ sleep."\nURL: https://quizlet.com/78519058/psyc-test-2-flash-cards/\n\n[8] "You can generate easily a fake group chat conversation like Whatsapp, Facebook or Telegram. After creating members/users, you can add messages in your chat. Once all messages are set up, you have the possibility to live-preview the chat conversation via the play button. Until the share functionality is ready, you have the option to screen ..."\nURL: https://chat-simulator.com/\n\n[9] "This is a program that allows the computer to simulate conversation with a human being: answer choices a. Speech Application Program Interface b. Chatbot c. Voice Recognition d. Speech Recognition Question 7 30 seconds Report an issue Q. This is a system of Programs and Data-Structures that mimics the operation of the human brain: answer choices a."\nURL: https://quizizz.com/admin/quiz/5f183913423fab001b0bd134/ai-unit-1\n\n[10] "This is a system of Programs and Data-Structures that mimics the operation of the human brain: answer choices a. Intelligent Network b. Decision Support System c. Neural Network d. Genetic Programming Question 8 30 seconds Q. Where is Decision tree used? answer choices a. Classification Problem b. Regression Problem c. Clustering Problem d."\nURL: https://quizizz.com/admin/quiz/5f6d6e4a6e2458001be385f5/ai-class-9\nCurrent date: 1/27/2023\n\nInstructions: Using the provided web search results, write a comprehensive reply to the given query. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\n\nQuery: Simulate a conversation between Alice and /u/CruxHub. They talk about which company from the data batches is worth researching further into on the web.',
        'Simulate a conversation between Alice and /u/CruxHub. They talk about which company from this data batch is worth researching further into on the web.\n\nData batch: Entity Name Purpose / Source Hypothesized Acronym\n50|CALC V LP|Investment manager for CSHC CHINA LLC and CITADEL (SHANGHAI) TRADING COMPANY LTD; https://files.brokercheck.finra.org/firm/firm\\_131114.pdf| \n51|CAMBRIDGE FINANCIAL GROUP, LTD|See CITADEL INVESTMENT GROUP LLC| \n52|CCFD OFFSHORE HOLDINGS LTD|NFA Pool ID P064386, ceased trading 5/3/2013| \n53|CCLC HOLDINGS LLC|Owns CITADEL CLEARING LLC, "Citadel Clearing Holdco"; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n54|CCMFL LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n55|CCOF OFFSHORE HOLDINGS LTD|NFA Pool ID P064392, ceased trading 5/3/2013| \n56|CDC PARTNERS, LP f/k/a GLB PARTNERS, LP|see Cook County, IL doc 0608910081| \n57|CDG HOLDINGS LTD|NFA Pool ID P037047, ceased trading 12/30/2009| \n58|CE TM HOLDINGS LLC f/k/a KCG IP HOLDINGS LLC|Holding company for intellectual property (25 trademarks, 1 patent found so far)|CE TM = Citadel Enterprise Trademark Holdings \n59|CEF OFFSHORE HOLDINGS LTD|NFA Pool ID P131121| \n60|CEIF INTERNATIONAL LTD|NFA Pool ID P048476; http://registers.centralbank.ie/ICAVDocuments/C439830/Director%20Details%20Updated%2021.01.07%203.pdf| \n61|CEIF LLC|NFA Pool ID P048474| \n62|CEIF PARTNERS INTERNATIONAL LTD|NFA Pool ID P173278| \n63|CEIF PARTNERS LLC|NFA Pool ID P048475| \n64|CES SECURITIES CANADA ULC|See CITADEL SECURITIES CANADA ULC, CSA NRD # 49280| \n65|CFPS HOLDINGS S.\u00e0 r.l.|Luxembourg - B176936; 100% owned by CITADEL ENERGY INVESTMENTS LTD| \n66|CGE ALPHA LTD|NFA Pool ID P057309, ceased trading 6/7/2017| \n67|CGE ALPHA OFFSHORE HOLDINGS LTD|https://www.sec.gov/Archives/edgar/vprr/1600/16003280.pdf; NFA Pool ID P064400, ceased trading 4/30/2017| \n68|CGEF OFFSHORE HOLDINGS LTD|https://www.sec.gov/Archives/edgar/vprr/1600/16003280.pdf; NFA Pool ID P064406, ceased trading 2/21/2019| \n69|CGEF SPC|NFA Pool ID P064408, ceased trading 12/31/2012| \n70|CGMF OFFSHORE HOLDINGS LTD|NFA Pool ID P064410, ceased trading 3/31/2014| \n71|CGTS HOLDINGS S.\u00e0 r.l.|Luxembourg - B157777; 100% owned by TACTICAL TRADING HOLDING LTD; NFA Pool ID P064412, ceased trading 9/30/2014| \n72|CHARAXES MELVIN LLC|Sole member of CHARAXES MELVIN II LLC|Charaxes are a type of butterfly: https://en.wikipedia.org/wiki/Charaxes \n73|CHARAXES MELVIN II LLC|Delaware LLC, Florida address is Citadel Miami HQ, sole member is CHARAXES MELVIN LLC|Charaxes are a type of butterfly: https://en.wikipedia.org/wiki/Charaxes \n74|CHI2LTV LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n75|CIG(E) LLP|See CITADEL EUROPE LLP| \n76|CIG CANADA ULC|https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n77|CIG MEDIA LLC|https://www.sec.gov/Archives/edgar/data/923877/000114420407003635/v063478\\_sc-13d.htm| \n78|CITADEL AAM LP|https://www.sec.gov/Archives/edgar/vprr/0804/08040017.pdf| \n79|CITADEL AC INVESTMENTS LTD|https://www.sec.gov/Archives/edgar/data/1015780/000114420408032074/v115701\\_sc13da.htm| \n80|CITADEL ADVISORS EUROPE LIMITED f/k/a CITADEL MANAGEMENT (EUROPE) LIMITED f/k/a CITADEL HEDGE FUND SERVICES (EUROPE) LIMITED|https://find-and-update.company-information.service.gov.uk/company/10930267| \n81|CITADEL ADVISORS HOLDINGS LP|Sole member of CITADEL ADVISORS LLC; https://www.sec.gov/Archives/edgar/data/1567180/000110465922099806/xslF345X03/tm2225817-2\\_4.xml| \n82|CITADEL ADVISORS HOLDINGS II LP|https://www.sec.gov/Archives/edgar/data/1177609/000114420416082613/v429844\\_sc13ga.htm| \n83|CITADEL ADVISORS HOLDINGS III LP|https://www.sec.gov/Archives/edgar/data/1640129/000114420415043739/xslF345X02/v416000\\_3.xml| \n84|CITADEL ADVISORS LLC|NFA ID: 0391913; https://www.sec.gov/edgar/browse/?CIK=1423053| \n85|CITADEL ADVISORS II LLC|| \n86|CITADEL ADVISORS SINGAPORE PTE. LIMITED|| \n87|CITADEL ALTERNATIVE ASSET MANAGEMENT LP|https://www.sec.gov/Archives/edgar/data/1027745/000114420408050200/v124853\\_sc13g.htm| \n88|CITADEL AMERICAS LLC|| \n89|CITADEL AMERICAS SERVICES LLC|| \n90|CITADEL ANTAEUS INTERNATIONAL INVESTMENTS LTD|| \n91|CITADEL ASIA ASSET HOLDING LIMITED|http://registers.centralbank.ie/ICAVDocuments/C157189/Director%20Details%20Updated%2016.10.31%202.pdf| \n92|CITADEL ASIA LIMITED f/k/a CITADEL (HONG KONG) LIMITED|https://adviserinfo.sec.gov/firm/summary/148826| \n93|CITADEL CANDLESTICK EIF LLC|| \n94|CITADEL CANTERBURY S.\u00e0 r.l.|Luxembourg - B87988; 100% owned by CITADEL TONBRIDGE S.\u00e0 r.l.| \n95|CITADEL CEFL CHINA LTD|NFA Pool ID P148073| \n96|CITADEL CEFL INVESTMENTS LTD|NFA Pool ID: P161763; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n97|CITADEL CEIT CHINA LTD|| \n98|CITADEL CEMF CHINA LTD|https://find-and-update.company-information.service.gov.uk/company/02263951/charges/x6zPQSYGNpuDNgxU1cFQlCS0iog| \n99|CITADEL CEMF INVESTMENTS LTD|https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n100|CITADEL CEMF SPV LTD f/k/a CITADEL INVESTMENT MASTER FUND LTD|See CITADEL INVESTMENT MASTER FUND LTD; https://opencorpdata.com/lei/LF0U6QUBXKIO573GXS38|',
        'Simulate a conversation between Alice and /u/CruxHub. /u/CruxHub asks Alice to anlalyze a data batch for non-standard insights.\n\nData batch: Entity Name Purpose / Source Hypothesized Acronym\n50|CALC V LP|Investment manager for CSHC CHINA LLC and CITADEL (SHANGHAI) TRADING COMPANY LTD; https://files.brokercheck.finra.org/firm/firm\\_131114.pdf| \n51|CAMBRIDGE FINANCIAL GROUP, LTD|See CITADEL INVESTMENT GROUP LLC| \n52|CCFD OFFSHORE HOLDINGS LTD|NFA Pool ID P064386, ceased trading 5/3/2013| \n53|CCLC HOLDINGS LLC|Owns CITADEL CLEARING LLC, "Citadel Clearing Holdco"; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n54|CCMFL LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n55|CCOF OFFSHORE HOLDINGS LTD|NFA Pool ID P064392, ceased trading 5/3/2013| \n56|CDC PARTNERS, LP f/k/a GLB PARTNERS, LP|see Cook County, IL doc 0608910081| \n57|CDG HOLDINGS LTD|NFA Pool ID P037047, ceased trading 12/30/2009| \n58|CE TM HOLDINGS LLC f/k/a KCG IP HOLDINGS LLC|Holding company for intellectual property (25 trademarks, 1 patent found so far)|CE TM = Citadel Enterprise Trademark Holdings \n59|CEF OFFSHORE HOLDINGS LTD|NFA Pool ID P131121| \n60|CEIF INTERNATIONAL LTD|NFA Pool ID P048476; http://registers.centralbank.ie/ICAVDocuments/C439830/Director%20Details%20Updated%2021.01.07%203.pdf| \n61|CEIF LLC|NFA Pool ID P048474| \n62|CEIF PARTNERS INTERNATIONAL LTD|NFA Pool ID P173278| \n63|CEIF PARTNERS LLC|NFA Pool ID P048475| \n64|CES SECURITIES CANADA ULC|See CITADEL SECURITIES CANADA ULC, CSA NRD # 49280| \n65|CFPS HOLDINGS S.\u00e0 r.l.|Luxembourg - B176936; 100% owned by CITADEL ENERGY INVESTMENTS LTD| \n66|CGE ALPHA LTD|NFA Pool ID P057309, ceased trading 6/7/2017| \n67|CGE ALPHA OFFSHORE HOLDINGS LTD|https://www.sec.gov/Archives/edgar/vprr/1600/16003280.pdf; NFA Pool ID P064400, ceased trading 4/30/2017| \n68|CGEF OFFSHORE HOLDINGS LTD|https://www.sec.gov/Archives/edgar/vprr/1600/16003280.pdf; NFA Pool ID P064406, ceased trading 2/21/2019| \n69|CGEF SPC|NFA Pool ID P064408, ceased trading 12/31/2012| \n70|CGMF OFFSHORE HOLDINGS LTD|NFA Pool ID P064410, ceased trading 3/31/2014| \n71|CGTS HOLDINGS S.\u00e0 r.l.|Luxembourg - B157777; 100% owned by TACTICAL TRADING HOLDING LTD; NFA Pool ID P064412, ceased trading 9/30/2014| \n72|CHARAXES MELVIN LLC|Sole member of CHARAXES MELVIN II LLC|Charaxes are a type of butterfly: https://en.wikipedia.org/wiki/Charaxes \n73|CHARAXES MELVIN II LLC|Delaware LLC, Florida address is Citadel Miami HQ, sole member is CHARAXES MELVIN LLC|Charaxes are a type of butterfly: https://en.wikipedia.org/wiki/Charaxes \n74|CHI2LTV LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n75|CIG(E) LLP|See CITADEL EUROPE LLP| \n76|CIG CANADA ULC|https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n77|CIG MEDIA LLC|https://www.sec.gov/Archives/edgar/data/923877/000114420407003635/v063478\\_sc-13d.htm| \n78|CITADEL AAM LP|https://www.sec.gov/Archives/edgar/vprr/0804/08040017.pdf| \n79|CITADEL AC INVESTMENTS LTD|https://www.sec.gov/Archives/edgar/data/1015780/000114420408032074/v115701\\_sc13da.htm| \n80|CITADEL ADVISORS EUROPE LIMITED f/k/a CITADEL MANAGEMENT (EUROPE) LIMITED f/k/a CITADEL HEDGE FUND SERVICES (EUROPE) LIMITED|https://find-and-update.company-information.service.gov.uk/company/10930267| \n81|CITADEL ADVISORS HOLDINGS LP|Sole member of CITADEL ADVISORS LLC; https://www.sec.gov/Archives/edgar/data/1567180/000110465922099806/xslF345X03/tm2225817-2\\_4.xml| \n82|CITADEL ADVISORS HOLDINGS II LP|https://www.sec.gov/Archives/edgar/data/1177609/000114420416082613/v429844\\_sc13ga.htm| \n83|CITADEL ADVISORS HOLDINGS III LP|https://www.sec.gov/Archives/edgar/data/1640129/000114420415043739/xslF345X02/v416000\\_3.xml| \n84|CITADEL ADVISORS LLC|NFA ID: 0391913; https://www.sec.gov/edgar/browse/?CIK=1423053| \n85|CITADEL ADVISORS II LLC|| \n86|CITADEL ADVISORS SINGAPORE PTE. LIMITED|| \n87|CITADEL ALTERNATIVE ASSET MANAGEMENT LP|https://www.sec.gov/Archives/edgar/data/1027745/000114420408050200/v124853\\_sc13g.htm| \n88|CITADEL AMERICAS LLC|| \n89|CITADEL AMERICAS SERVICES LLC|| \n90|CITADEL ANTAEUS INTERNATIONAL INVESTMENTS LTD|| \n91|CITADEL ASIA ASSET HOLDING LIMITED|http://registers.centralbank.ie/ICAVDocuments/C157189/Director%20Details%20Updated%2016.10.31%202.pdf| \n92|CITADEL ASIA LIMITED f/k/a CITADEL (HONG KONG) LIMITED|https://adviserinfo.sec.gov/firm/summary/148826| \n93|CITADEL CANDLESTICK EIF LLC|| \n94|CITADEL CANTERBURY S.\u00e0 r.l.|Luxembourg - B87988; 100% owned by CITADEL TONBRIDGE S.\u00e0 r.l.| \n95|CITADEL CEFL CHINA LTD|NFA Pool ID P148073| \n96|CITADEL CEFL INVESTMENTS LTD|NFA Pool ID: P161763; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n97|CITADEL CEIT CHINA LTD|| \n98|CITADEL CEMF CHINA LTD|https://find-and-update.company-information.service.gov.uk/company/02263951/charges/x6zPQSYGNpuDNgxU1cFQlCS0iog| \n99|CITADEL CEMF INVESTMENTS LTD|https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n100|CITADEL CEMF SPV LTD f/k/a CITADEL INVESTMENT MASTER FUND LTD|See CITADEL INVESTMENT MASTER FUND LTD; https://opencorpdata.com/lei/LF0U6QUBXKIO573GXS38|',
        'Web search results:\n\n[1] "Katherine Burton Hedge fund titans Ken Griffin and Steve Cohen boosted Gabe Plotkins Melvin Capital, injecting a total of $2.75 billion into the firm after it lost about 30% this year. Citadel..."\nURL: https://www.bloomberg.com/news/articles/2021-01-25/citadel-point72-to-invest-275-billion-in-melvin-capital\n\n[2] "NEW YORK, Jan. 25, 2021 /PRNewswire/ -- Melvin Capital Management (Melvin) today announced that Citadel and its partners and Point72 have made investments into its fund. I am incredibly..."\nURL: https://www.prnewswire.com/news-releases/melvin-announces-2-75-billion-investment-from-citadel-and-point72--301214477.html\n\n[3] "Citadel LLC is further paring back its $2 billion investment in Melvin Capital Management after the hedge fund stumbled in its effort to recover from a near collapse triggered by surges in..."\nURL: https://www.wsj.com/articles/citadel-is-further-paring-back-2-billion-melvin-investment-11645710666\n\n[4] "Citadel and Steven A. Cohen s Point72 Asset Management together invested $2.75 billion into Melvins hedge fund on Jan. 25 as Melvin was hemorrhaging money. In return for the rare..."\nURL: https://www.wsj.com/articles/citadel-to-redeem-about-500-million-from-melvin-capital-11629550410\n\n[5] "CHARAXES MELVIN LLC is an Active company incorporated on August 5, 2022 with the registered number M22000012341. This Foreign Limited Liability company is located at SOUTHEAST FINANCIAL CENTER, 200 S. BISCAYNE BLVD., SUITE 3300, MIAMI, 33131 and has been running for one year. ... CITADEL SECURITIES GP LLC; KCG SPACE HOLDINGS LLC;"\nURL: https://bisprofiles.com/fl/charaxes-melvin-m22000012341\n\n[6] "Now, Citadel is taking some of its money back. Citadel has notified Melvin of its plans to retrieve $500 million of the $2 billion it injected in late January, according to two people briefed..."\nURL: https://www.nytimes.com/2021/08/21/business/citadel-melvin-gamestop.html\n\n[7] "Robinhood and Citadels relationship comes into focus as Washington vows to examine stock-market moves Trading firms at center of Reddit-fueled stock surges have worked closely to share..."\nURL: https://www.washingtonpost.com/business/2021/01/29/robinhood-citadel-gamestop-reddit/\n\n[8] "Alongside hedge funds such as Melvin Capital, Citron Capital, Point72, D1 Capital Partners, and Candlestick Capital Management; Citadel LLC was, the lawsuit claims, taking up short positions against the securities that retail investors were longing. This alleged conflict of interest is at the core of the class action lawsuit."\nURL: https://tokenist.com/new-lawsuit-alleges-citadel-conspired-with-robinhood-to-limit-gme-trading/\n\n[9] "Melvin later attracted an additional $3.2 billion in fresh cash, and the firm had $11.7 billion in assets at the beginning of this year. Point72 hasnt redeemed its investment, a person familiar ..."\nURL: https://www.chicagobusiness.com/finance-banking/ken-griffins-citadel-pulling-back-most-its-2-billion-melvin-capital-investment\n\n[10] "CHARAXES MELVIN II LLC branch. Company Number M22000012338 Status Active Incorporation Date 5 August 2022 (2 months ago) Company Type Foreign Limited Liability Jurisdiction Florida (US) Branch Branch of CHARAXES MELVIN II LLC (Delaware (US)) Agent Name C T CORPORATION SYSTEM Agent Address"\nURL: https://opencorporates.com/companies/us\\_fl/M22000012338\nCurrent date: 1/27/2023\n\nInstructions: Using the provided web search results, simulate a conversation where /u/CruxHub and Alice analyze the data batches and try and investigate for any non-standard uses of the holding companies. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\n\nQuery: What is Charaxes Melvin LLC\'s relationship to Citadel?',
        'Web search results:\n\n[1] "Federal authorities are investigating the market-making arms of Citadel LLC and KCG Holdings Inc, looking into the possibility that the two giants of electronic trading are giving small investors ..."\nURL: https://www.reuters.com/article/usa-stocks-probe-idUSL2N1871ZV\n\n[2] "Today, KCG is second only to Citadel in the market for handling stock order flow from retail brokerage firms. KCG and many other high-frequency trading firms have shied away from the public..."\nURL: https://www.ibtimes.com/citadel-llc-kcg-holdings-kcg-market-making-arms-probed-federal-authorities-over-stock-2366805\n\n[3] "Citadel Securities, a group owned by the Chicago-based hedge fund, is the would-be acquirer in the deal, the people said. The group is best known for its so-called wholesaler business that..."\nURL: https://www.wsj.com/articles/market-making-arm-of-citadel-llc-in-talks-to-buy-seats-on-nyse-floor-from-kcg-holdings-1454533971\n\n[4] "Citadels share of the wholesale market is around 34 per cent compared to KCGs 25 per cent, according to Tabb Group. Virtu has yet to lay out in detail its plans for the wholesale business ..."\nURL: https://www.ft.com/content/e1cb396e-29a7-11e7-bc4b-5528796fe35c\n\n[5] "Citadel Securities, a liquidity providers and market maker, announced it will purchase KCG Holdings designated market maker (DMM) business at the New York Stock Exchange. This will establish Citadel Securities as the DMM with the largest footprint on the NYSE, responsible for trading in approximately 1,500 issues."\nURL: https://www.tradersmagazine.com/departments/brokerage/citadel-purchases-kcg-dmm-business-becomes-1-on-nyse/\n\n[6] "isCitadel LLC and its related entity, KCG IP Holdings, LLC (Complainant), represented by Paul D. McGradyof Winston Strawn, Illinois, Respondent is- (Respondent), Alabama, USA. REGISTRAR AND DISPUTED DOMAIN NAME The domain name at issue iscitidelgroup.com, registered with TUCOWS, INC. PANEL The"\nURL: https://www.adrforum.com/domaindecisions/1522837.htm\n\n[7] "KCG SPACE HOLDINGS LLC is an Active company incorporated on July 21, 2022 with the registered number M22000011413. This Foreign Limited Liability company is located at 200 S BISCAYNE BLVD STE 3300, MIAMI, FL, 33131, US and has been running for one year. It currently has one Authorized Person. KEY FACTS ABOUT KCG SPACE HOLDINGS LLC US Businesses"\nURL: https://bisprofiles.com/fl/kcg-space-holdings-m22000011413\n\n[8] "The Complainant KCG IP Holdings LLC is the owner of US Trademark Registration No. 3,213,943, filed October 18, 2004, registered February 27, 2007, claiming first use dating back to 1994. Therefore, the Panel concludes that Complainants filing and registration of the CITADEL mark with the USPTO sufficiently demonstrates that it has rights in ..."\nURL: https://www.adrforum.com/domaindecisions/1579141.htm\n\n[9] "The KCG SPACE HOLDINGS LLC principal address is 200 S BISCAYNE BLVD STE 3300, MIAMI, 33131. Meanwhile you can send your letters to 200 S BISCAYNE BLVD STE 3300, MIAMI, FL, 33131. The company`s registered agent is C T CORPORATION SYSTEM 1200 SOUTH PINE ISLAND ROAD, PLANTATION, FL, 33324. The company`s management are A, President - Beeson Gerald A."\nURL: https://florida.intercreditreport.com/company/kcg-space-holdings-llc-m22000011413\n\n[10] "Billionaire Ken Griffin has built Citadel Securities into a trading and asset management colossus. ... and KCG Holdings. Last month, Citadel Securities reached an agreement with the SEC to pay $22 ..."\nURL: https://www.chicagobusiness.com/article/20170203/NEWS01/170209978/chicago-billionaire-ken-griffin-splits-citadel-into-two-companies\nCurrent date: 1/27/2023\n\nInstructions: Using the provided web search results, simulate a conversation where /u/CruxHub and Alice analyze the data batches and try and investigate for any non-standard uses of the holding companies. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\n\nQuery: What is KCG Space Holdings LLC\'s relationship to Citadel?',
        'Web search results:\n\n[1] "Citadel LLC (formerly known as Citadel Investment Group, LLC) is an American multinational hedge fund and financial services company. Founded in 1990 by Ken Griffin, it has more than $50 billion in assets under management as of May 2022. [1]"\nURL: https://en.wikipedia.org/wiki/Citadel\\_LLC\n\n[2] "NASHVILLE, Tenn. and BRONXVILLE, N.Y. \u2014 Standard Media Group LLC (Standard Media) and Citadel Communications LLC (Citadel) jointly announced today that they have reached an agreement pursuant to which Standard Media will acquire from Citadel WLNE-TV, the ABC affiliate for the Providence, RI - New Bedford, MA market (DMA 52) and KLKN (TV), the \u2026"\nURL: https://www.standardmedia.com/2019/05/16/standard-media-group-to-acquire-citadel-stations/\n\n[3] "CITADEL MEDIA LLC. Citadel Media LLC is a New Hampshire Domestic Limited-Liability Company filed on February 6, 2021. The companys filing status is listed as Not In Good Standing and its File Number is 862423. The Registered Agent on file for this company is Peter Alan Gauthier and is located at 3 Maple Ridge Drive Unit 224, Merrimack, NH 03054."\nURL: https://www.bizapedia.com/nh/citadel-media-llc.html\n\n[4] "CITADEL MEDIA LLC is a Michigan Domestic Limited-Liability Company filed on November 16, 2017. The companys filing status is listed as Active and its File Number is 802132896. The Registered Agent on file for this company is Registered Agents Inc. and is located at 2222 W. Grand River Ave Ste A, Okemos, MI 48864. The companys mailing address ..."\nURL: https://www.bizapedia.com/mi/citadel-media-llc.html\n\n[5] "Citadel Broadcasting Corporation was a Las Vegas, Nevada -based broadcast holding company. Citadel owned 243 radio stations across the United States and was the third-largest radio station owner in the country. Only iHeartMedia and Cumulus Media owned more stations prior to Citadels merger with Cumulus."\nURL: https://en.wikipedia.org/wiki/Citadel\\_Broadcasting\n\n[6] "Citadel is one of the largest hedge fund managers in the world. And theyve subsequently managed Melvin Capital to the ground. Melvin Capital suffered a loss of over 50% its first quarter in 2021 due to shorting AMC Entertainment and GameStop. At some point youd expect your clearing house to raise awareness on your risk management right?"\nURL: https://franknez.com/citadel-loses-billions-hedge-funds-are-getting-dragged-down/\n\n[7] "At our core, Citadel is built to deliver excellence. We have some of the most talented and focused minds in the industry, and we activate their ideas and strategies through a robust range of proven technologies and execution capabilities. View Top Employees from Citadel LLC Looking for a particular Citadel LLC employees phone or email? Find Info"\nURL: https://rocketreach.co/citadel-llc-profile\\_b5c46522f42e0dc2\n\n[8] "# 1 Most profitable hedge fund manager of all time Source: LCH Investment NV estimates, Top Hedge Fund Managers by Net Gains Since Inception as of 12/31/2022. Our people are relentless in seeking a better way. Each day, we reimagine and refine our strategies, models and technology in pursuit of superior results and long-term performance."\nURL: https://www.citadel.com/\n\n[9] "We are one of the most significant alternative investment managers in the public U.S. corporate credit markets. Explore Credit Convertibles Equities Equities represents one of the largest and longest tenured businesses at Citadel. Explore Equities Global Fixed Income Macro We are a leading fixed income and macro business."\nURL: https://www.citadel.com/what-we-do/\n\n[10] "Citadel. 203,101 followers. 1mo. Last weekend, we celebrated Citadels 30th anniversary at an incredible event at Disney World and Universal Studios. Our founder and CEO Ken Griffin summarized ..."\nURL: https://www.linkedin.com/company/citadel-llc\nCurrent date: 1/27/2023\n\nInstructions: Using the provided web search results, simulate a conversation where /u/CruxHub and Alice analyze the data batches and try and investigate for any non-standard uses of the holding companies. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\n\nQuery: What is CITADEL MEDIA LLC?',
        "What are the differences between the Dogme approach to language learning and the lexical approach to language learning",
        "Implement my own netfilter in linux with linux kernel module with Rust",
        "Damage to which nerve causes numbness of the palmar surface of the 5th digit/little finger",
        "Explain the fault-tolerance of the reaction control system on the Space Shuttle",
        "Hi, can you help me download 2000 portrait sketch images from Pinterest website with resolution at least 512 \\* 512? using python code",
        "Tell me about the negatives of farming meat",
        "what is the photograph filter called where the only part of the image is greyscale",
        "I want some geological database structure with some example data for practicing my SQL query skills. Would you generate that for me?",
        "What is a formal but simplified explanation of Web marketing",
        "Rewrite and improve this story: Well, I have always liked helping people since I was a small child, I have been accused many times of giving too much away for free, but I find joy in helping others put the pieces together to reach their goals. As a Licensed Professional Counselor and Life Coach that is my job to impact individuals and help clients work through emotional difficulties and reach goals. But I will be honest with you I was selling the dream but not always living the dream. I had issues I had not worked completely through like childhood trauma, heartbreak, disappointments, and frustrations with life. Don't get me wrong I had the husband, the kids, the house and the 6 figure job but I was not happy inside, but I didn't change because I hate change, most of us hate change, right? Then I lost my sister, my friend, and it slapped me in the face that I need to take care of myself. I saw the addiction, I saw her not taking care of herself and I could not save her. One thing I know for sure, if you do not make your wellness a priority illness will find you. I remember the moment we lost her, the earth stood still and then my heart broke into pieces, what was I going to do, I have loved her my whole life! It was months later that I made a decision that I would be the change I hope to see, I would create a space for women of color to move past the obstacles that keep us from creating the life we want and Brown Suga Wellness was born. I am on this journey and I invite you to be on this journey with me! I love this quote by Oludara Adeeyo: \"When you heal yourself, you create an earth shattering legacy. The lineage of women who come after you will be healed. Your inner circle of Black women around you, healed.\" When you choose yourself you break generational trauma and curses. You activate your ancestral strength. I invite you to activate that strength!",
        "How would you ask these questions: Tell everyone a little about you, where you from, what you like doing?\nWhat goals are you pursuing right now?\nWho has made the most influence in your life?\nWho is the one person that you admire the most (alive or dead)?\nWhat is the hardest challenge you\u2019re had to overcome in your life?\nWhen have you grown the most in your life and what caused that growth?\nWhere is your favorite place to relax and renew?\nWhat books have changed your life the most?\nWhat Is the biggest thing that you want the audience to take away today?\nHow can people get a hold of you to talk about your business?",
        "Take these topics into a numbered table and generate subtopics in seperated lines for each. Preconfigure these subtopics as lections of those several topics and add them to the table. Use numbers for topics and letters for subtopics. Set a status (untouched/touched) for every subtopic in 3. coloumn of the table to mark them done when finished learning this subtopic and topic. Use coloumn 4 of the table for a short resumee of the chapter. Showing the learning process in percentage in front of every new output is first. Show the Table and wait for any userinput to start lessons on those topics.;:~|@%\\*~;;:~|@%\\*~;;:~|@%\\*~;;:~|@%\\*~;;:~|@%\\*~;;:~|@%\\*~;",
        "Write a rap song about Mikkel Selko",
        "list the largest outdoor retailers in the world",
        "can you create a wordpress shortcode to include the following code from facebook sdk",
        'Is this grammatically correct: "It only took 5 years, and while we still have a long way to go, Topher\u2019s Farm has found its place with unique experience and offering of organic produce. "',
        "Hello friend. My task for today is to engage in a debate with you. Will you humor me in this regard?",
        "You are an expert marketing consultant and copywriter with expertise is direct response marketing. I need your help. Can I tell you about my business?",
        'here is part 1\n\n----\nDaySculpting is a program that that deals with YOUR immediate future\u2026.It is a 90 day program that teaches U how to create Success\u2026 one day at a time\u2026today\u2026\nUsing recent breakthroughs in the field of neuroscience, the study of the human brain, DaySculpting is one of the most powerful success systems on earth for creating what I call\u2026 \n"Your Epic Ideal Day" -- And when U have Epic Ideal Days? U create your EPIC IDEAL LIFE.\n\nDaySculpting is broken down into 3 easy to accomplish segments throughout your day\u2026\n~The Morning Lift Process\u2026which sets U up with a MindState of Success and a design for U to follow throughout your day\u2026There is a morning email\u2026SMS text\u2026Inspiring Video\u2026Future Forward Tuning IN\u2026And a 3 step Success Step Declaration Process\u2026this only takes 15 minutes\u2026\n~Mid-Day Reconnect Process\u2026whatever your miid-day is\u2026U are encouraged to stop doing what U are doing and disconnect so U can re-connect\u2026by listening to a 5-minute Tuning In Re-Connection. We know that somewhere in the middle of our day it\u2019s easy to lose momentum and drift from our best intentions because of all the demands on our attention. It has been scientifically proven that when U disconnent for between 3 to 5 minutes at the midpoint of your day\u2026.your brain resets\u2026and your energy is replenished\u2026I like to call it a MindState Re-Boot that will inspire U to re-ignite your imagination\u2026this only takes 5 minutes\n~Highlight And Insight Review Process\u2026we all review our day however what DaySculpting \nanchors for U is an activation and integration process that gets U to see your day as being successful\u2026by celebrating your successes (your highlights) and being present to things U could have improved on (your insights) so U can make your insights into highlights..most people when they review their day fail to celebrate even the smallest increments of success\u2026they focus on what they didn\u2019t do and that puts them in a negative energy\u2026Success has challenges and the\nhighlights and insight process encourages and empowers U to honestly see what U are doing each day so U Sculpt new MindStates Of Success rather than the energy of uncertainty\u2026\nthis takes 10 minutes\n\nThe whole DaySculpting process takes 30 minutes a day\u2026and as I always say if U don\u2019t have \n30 minutes to change your life then U don\u2019t want to change your life and U are okay with living \na mediocre life\u2026\n\nDay Sculpting is about targeting specific Chief Aims U have for your life\u2026and creating the Habits that will get U there\u2026Imagine being able to replace the MindTraps (your limiting beliefs) with empowering rituals and habits that become your new normal\u2026\n\nThrough the repetition of doing the daily DaySculpting process U are carving into your Subconscious memory thoughts, beliefs and actions that result in U sculpting the masterpiece known as U\u2026\n\nThere are many programs out there that attempt to instill new success behaviors however many fall short of actually shifting your MindStates into a frequency of possibility where U get to actually see your daily results immediately\u2026DaySculpting does this\u2026\n\nThis is not science fiction\u2026 and it\'s not wishful thinking, or some tired old self-improvement, goal-setting program\u2026 DaySculpting is a program that empowers U to manifest and realize your Chief Aims in life\n\n"DaySculpting" -- is a tool that takes just MINUTES a day for you to use\u2026\n\nIt is designed to FREE UP hours in your day\u2026 while at the SAME time empowering you for greater success in ANY area of your life.\n\nDaySculpting sheds light and solves an age-old problem:\nWHY we often fight against the very changes we desire to make\n\nHave you ever experienced the FEELING that you deserve MORE out of your life? More financial freedom and greater rewards from the hard work you do every day? Deeper, more empowering relationships with those you love\u2026 or maybe just meeting that special someone to share your life with? Perhaps you crave a deeper spiritual connection\u2026 or a more healthy, trim, energetic body?\u2026 \nYET:\nDespite your BEST intentions\u2026 you struggle. Perhaps if you\'re anything like me, you even self-sabotage your results with actions that you KNOW are not in your best interest.\n\nMaybe it FEELS like it did for me: Like you are swimming upstream\u2026 making SOME progress, sure, but just not reaching your goals and desires fast enough.\n\nWell, I have wonderful news for you: It\'s not because you\'re lazy\u2026 and it\'s not because you are not smart enough, competent enough\u2026 or ANYTHING enough! \n\nThe real REASON you desire more and are not seeing ALL the results you deserve lies within whether the Success Switch in your brain is in the ON or OFF position\u2026\n\nThe SOLUTION\u2026 THE ANSWER to flipping your Success Switch back ON lies within the simple daily steps U will take when U experience the DaySculpting Program\u2026 \nThe Day Sculpting Program Is A Simple Step Daily Success RITUAL \u2028 That Shuts Down Your Body\'s Failure Reflex \u2028 So YOU Tap Into Your Brains Success Centers\u2026\u2028 In Just Minutes A Day!\u2028\u2028 IIMAGINE Knowing What HIGHLY SUCCESSFUL \u2028 People Do EVERYDAY\u2026\nFor Abundance And Wealth, Greater Health, Self-Confidence Meaningful Relationships, Sharper Focus , Deeper Joy\u2026\u2028 And So Much More\u2026\n\u201cNow You Too Can Use This 90-Day Game Changer\u2028 To Tap Into The Key Success Centers Of Your Mind,\u2028 And In Just Minutes You Can Transform Even Lousy Days\u2028 Into Days Filled With The Results You Desire \u2013 Guaranteed!\u201d\nTO MAKE A GREAT LIFE, ALL YOU HAVE TO IS MAKE EACH DAY A GREAT DAY \u2026 \nThen get up tomorrow and do the same thing, day after day after day.\nARE YOU Ready To Change YOUR LIFE One Day At A Time\u2026\nThe comprehensive, fun and empowering 90-day DaySculpting program provides you with the life skills and tools to help you master a new MindState of Success and a range of powerful life-changing rituals and habits that will Sculpt Your Perfect Days Into A Great Life.\nDAY SCULPTING WILL TEACH YOU:\n\u2022 The science behind HAVING A MindState Of Success...and why most people who want more in life actually have their success switch turned off by total accident!\n\u2022 How to get more done with more time and more energy left over!\n\u2022 The simple, yet powerful, process of building a powerful day so you create a series of "Dynamic Days" - days that will end up building your most incredible life (The one you always thought was out of reach!)\n\u2022 Learn the \'Day Sculpting Principles\'. These can have a huge impact on you your life, but when you learn how simple they really are, you can use them easily and consistently!\n\u2022 How in just a few minutes a day, you can keep positive results flowing and put your success energy into a permanent \'ON\' position!\n\u2022 And much more!\nDaySculpting, is for those who are willing to take their life to the next level by creating new Success Habits replacing the ones that have been sabotaging your success. \nSo make sure you can honestly agree with the following before experiencing DaySculpting:\n\u2022 You desire more out of life, yet feel as if you are "missing something" -- that special "X Factor" to take you to the next level?\n\u2022 You are brave enough to boldly say, "I want greater wealth and financial freedom... and I demand the best lifestyle possible for me and my family!\n\u2022 You know the value of joy: You want to experience greater happiness, peace of mind, and connection with your friends and loved ones on a daily basis.\nIf you agree with the above, and truly want to create the best life possible, with greater wealth, freedom, happiness, love, and fulfillment, then I invite you to experience the power of Day Sculpting \u2026it will change the way you think about creating your day and the life you dream about. \nI am not encouraging you to become busier but rather to use your mental and emotional, energy more elegantly sculpting your day the way you want it to be. \nHow many times have you done a ton of work and still felt that you didn\u2019t accomplish what you really wanted for yourself. Week after week, month after month go by and you still are no farther ahead of the game\u2026stuck in the status quo that never seems to change.\n\nBreaking free means that the status quo of your life has to change\u2026 your habits of expectation have to change \u2026your mindset has to change\u2026you have to uncover those old behaviors that have held you back and be willing to create a new mindset.\n\nYou have to be willing to shift your daily focus inwards towards what you need to do today rather than tomorrow. Because when you create a great day today you welcome in a more powerful tomorrow.\n\nWe all have the same 24 hours each day. But why are some people building fabulous careers, achieving healthy lifestyles, enjoying great relationships and incomes, living their passions, and creating what they truly desire as a life?\n\nImagine that you could clear away the distractions that you unconsciously create. You know the stuff that consumes your time causes stress and disconnects you from your purpose and passion. \n\nImagine every day you embrace the energy for what you are choosing to create in your life. Your thoughts empower you, your choices inspire you and your actions create momentum, opportunity and possibility.\n\nYou can create a GREAT LIFE, the life you want to live by focusing your efforts on Creating a Great Day Today. That\u2019s Day Sculpting. Seven intentional sculpted days turn into a month of wonderful weeks and a year of magnificent months creating an amazingly successful life.\n\nNone of this is going to work though if you believe that what you were born with is all you will get\u2026\n\nNo one will ever attempt to do something when they are convinced that they will fail.\n\nResearch has shown that the brain will actually stop itself from doing what\u2019s necessary to succeed if a person believes that they cannot succeed.\n\nIt\u2019s the small concrete indicators of success today that will prove you can have whatever it is you want and the process of Day Sculpting will empowers, inspire and motivates you each step of the way.\n\nYou see: Confidence + Discipline = Desired Outcomes \n\nIt\u2019s time to stop looking at your life from a fear based I don\u2019t know how to mindset but rather be open to creating a solutions focused change consciousness that embraces your gift and talents and encourages you sharing them.\n\nLet me share a bit of nuero-chemistry with you\u2026\nWhat fires together wires together\u2026\n\nSo rather than Fall back on old habits\u2026\nTake the transitional step\u2026of being fully present to whats trying emerge as your ideal future and to help it along start building confidence each day\u2026\n\nAnd your possibility muscle and an intended thought process that leads to a more focused and clear out picturing of your desires.\n\nYou see...It\u2019s one thing to set goals and to make to do lists and to say your going to use the law of attraction to manifest what you want in life\u2026\n\nI\u2019m still looking at the many lists I have created.\n\nWhat it\u2019s really about is having a clear and purposeful intention in order to create the energy and the MindState Of success that will propel you into action.\n----\n\nWhen done ask me for part 2',
        "Here is the final part. Part 3\n---\n\nHere we will be showing how the principles and practices we\u2019ve covered so far converge into one over-arching result that will benefit you for the rest of your life. You can think of it as flipping a switch that changes how you create new results in life one day at a time. This is at the very core of what we call Day Sculpting. \nThe simplest way to think of it is that most of the way we live is habitual. You have an habitual way of brushing your teeth, walking, talking to yourself and others, eating, working. Habits are wonderful\u2026they make life easy but they also limit you. For example, if you have a habit of eating too much, you\u2019ll put on weight. Not instantly, but steadily, day by day, until one day you have a weight problem. If you try to change your weight quickly through a trendy new diet, research shows that the weight is likely to come back, and then some, within a few short months, because the habits required to live at your ideal weight have not been properly established. \nHabits are habits because you don\u2019t think about them, they happen nonconsciously. If you want a change in your life, you have to embody the change at a nonconscious level, so that the habits keeping your life the way it is today begin to shift.\nWouldn\u2019t it be great if there was a switch in the brain that would move you from status quo to status GO!? This is a switch that once you flip it will produce the result you want, if you are willing to commit to and stay with the process.Day Sculpting is your guide to fully realizing the success you are ready to enjoy.\nA critically important capacity of the human mind called preconscious processing. This is the ability of the mind to receive information, beneath our conscious awareness, and act upon it without even knowing that it is happening. Used correctly, this is an amazing power. Used improperly, it will sabotage your best efforts and make life extremely difficult.\nMost of us think we are running the show with our conscious awareness, consciously choosing our thoughts, behaviors, and emotions and consequently, we believe are able to choose the results we create in life. However, what neuro-science research shows, is that we all have a vast nonconscious mind that is really running the show most of the time. That deeper part of us, in charge of our habitual thinking, feeling, and behaving is always operating in our best interest. But it does so using information that may be faulty or outdated. If you continue to feed it information that doesn\u2019t serve you, it will continue to habitually bring results that are less than desired.\nYour preconscious processor is constantly routing new information directly into this larger database that your mind uses to create new behaviors. Your job is to place the right information into this database every single day, so that it can draw upon this new data and create new results. It requires your vigilance and purposeful intention on a daily basis. Day Sculpting is the process to accomplish exactly that, getting you to focus one day at a time on what you are trying to create in your life today, and the future you truly desire. \nA lot of experts in the human development field teach information and then expect it will translate into new behaviors automatically. But as we\u2019ve pointed out, and as you\u2019ve probably experienced, consciously knowing something and having the nonconscious mind put it into a new behavior, are two entirely different processes. What we are sharing with you is how to bridge that gap. This is precisely why so many experts in the field are recommending Day Sculpting to their clients, to help them use momentum mindsets on a daily basis and apply the good information they teach. \nWe talk about The The Solutions Focus process . Try it out: \nThink of an area of your life in which you are actively attempting to create different results. Imagine your chief aim regarding this area of your life as a perfect future. Now imagine a scale from one to ten, where ten is the perfect future and one is that you have not even started thinking about your chief aim. On this imaginary scale from 1 to 10, where would you place yourself right now?\nGo ahead and imagine where would you place yourself right now on that scale, where ten is your perfect future.\nWhatever number you came up with is fine. Whether it was 3 or 7, whatever you came up with I\u2019ll always ask the same next question. \u201cWhy so high and not lower?\u201d\nLet\u2019s say, for example that you came up with a three. Asking the question \u201cWhy so High\u201d catches the mind off guard. Most people expect, \u201cOnly a 3! Why so low?\u201d If I had asked that what would you come up with? All the reasons why things aren\u2019t working, who is to blame, problems, excuses, lack, limitations, and so on. \nBut when I ask \u201cWhy so high?\u201d the brain immediately begins to sort for all of the things that are working for you, everything that has brought you up to a \u201cthree.\u201d If you said you are at a seven on a scale of one to ten, the same question applies: \u201cWhy so high and not lower?\u201d\nThe next step in solutions focus is equally powerful. \u201cThink about what you can do today to move you one point up that scale\u2014for example, from a three to a four, or from a seven to an eight?\u201d When you ask this, your mind instantaneously starts generating ideas and options to answer your question. You quickly realize you can do more of the things that work, right? And if you are doing things that aren\u2019t working, you now have the insight into how you can do things differently. \nThis solutions focus approach provides quick insight into how to move things forward in areas you may have been stuck or working on unsuccessfully. It is a brilliant way to access more of your nonconscious database and facilitate discovering resources you did not know were there. \nSo as you can see, this video has been centered on connecting the dots and providing you with the insights on how you can flip the switch in your brain and how you can create your life one day at a time in the most powerful way possible. \nYou must contact that inner part of you that is in charge of your habitual ways of thinking, feeling, and behaving in order to re-sculpt yourself.\nThis is a unique psychological principle called anchoring. In the research this is also called behavioral conditioning, and as we\u2019ve called it, the law of reinforcement\u2026which says you get more of what you reinforce. When you want to reinforce a positive new behavior, you anchor it in a positive new momentum mindset. As you do this on a daily basis, you are literally training your mind, conditioning your thoughts, amplifying positive feelings and emotions to live into a future state that you are anchoring in your daily experience. \nDay Sculpting goes beyond personal development. It takes whatever it is you are currently learning and makes it possible for you to embody, apply and enjoy the benefits you are committed to achieve. \n\nThe last thing anyone needs is more stuff to do. What we need is that everything we do gets us the results we are going for. In essence what\u2019s needed is a system that will streamline our efforts so we accomplish our chief aims in less time.\n\nMichaelangelo said the process of sculpting is to remove what\u2019s not supposed to be there. He had the mindset that the finished sculpture already existed in the marble and he just had to reveal it. In the same way your destiny already resides in you. You just need to clear a path for it to emerge.\n\nWe all have 24 hours in a day. So why do some people consistently have great days while others are up and down and stay stuck in mediocrity? It\u2019s a disciplined habit of how you approach everyday. Day Sculpting takes the same 24 hours that we all have and helps clarify your choices so that your actions reveal your highest destiny. \n\nIt is a quick, easy and effortless way that supports and empowers your efforts in achieving your chief aims. It creates the mindsets necessary to have successful days, weeks, months and years.\n\nDay Sculpting is a 90- day program designed to empower you to create your life ONE DAY AT A TIME. By committing 30 minutes each day to create what you want that day. \n\nWe believe that when you focus your actions one day at a time the results you get become measurable and achievable. Your energy is committed to channeling your efforts so you create a confident groove in your mind that empowers your habitual actions to create what you really want.\n\nThis daily program is broken down into 3 MANAGEABLE, SIMPLE AND EASY STEPS. 15 minutes in the morning, 5 minutes midday and 10 minutes at night. \n\nDay Sculpting\u2026It\u2019s designed so that the way you start your day creates the momentum that carries you throughout your day. \n\nAnd finally research has shown that the best time to integrate what you\u2019ve learned in your day and to set yourself up for success tomorrow is before you go to sleep. The Nighttime Review process takes just 10 minutes, which is less time then it takes to take a shower or to take your dog on an evening walk.\n\nWe already have enough complexity in life\u2026don\u2019t we? We don\u2019t want you working harder we want you thinking smarter! So that the success you achieve is more effortless. \n\nSo what does it take for someone to accomplish the high level results we are talking about?\n\n\u2022 First you have to wake up and be totally jazzed about the day\n\u2022 You have to be inspired to do your best\n\u2022 You have to be focused on creating what you truly desire\n\u2022 You got to get to it, stay on it, and be in the energy of it before your distractions take over. \n\u2022 And if distractions takeover you have to quickly get back on track.\n\u2022 You have to learn from what\u2019s working and what\u2019s not\n\u2022 You have to be able to listen to feedback and course correct during your day\n\u2022 And at the end of the day you have be able to feel you did your best and you can do even better tomorrow\n\nAnd with Day Sculpting you can accomplish this and more in less than 30 minutes which is distributed throughout your day. Most people will give up on their dreams after they have tried something only 3 times because they didn\u2019t get instant gratification. \n\nThere are no magic bullets here. You are investing in a future YOU desire. \n\nDay Sculpting gives you the opportunity everyday to purposefully stay in the energy of what you want to create the benefit to you being a more empowered mindset that inspires passionate action and a willingness to breakthrough any barriers that may have held you back in the past so you fully embody the life you choose to live.\n\nYou may have heard Gandhi say \u201cBe the change you want to see in the world.\u201d Well now you can. \n\nYears ago I heard a statistic that blew me away. If you read in a single subject of your choice for 15 minutes a day 5 days a week you would become one of the leading experts in the world in that subject within 3 years\u2026\n\nMore recent research has demonstrated that world class talent requires 10000 hours and 10 years to develop\u2026\n\nSo the question is how does somebody create this kind of commitment and persistence? Clearly one day at a time.\n\nSo where are you not following through in your life? How would you like to do things differently? What can you do shift your energy when you say I can\u2019t get it done or you procrastinate? What\u2019s it going to take for you to say I\u2019ve had enough it\u2019s time for me to do something different? Where will you get the support you need to build the confidence to stay on track?\n\nEach day you get these elements to help guide you\u2026 \n- The Good Morning Great Day Email\n- The Morning In Vision Video \n- The Morning Future Pacing Visualization\n- The Morning Success Journal Process\n- The Midday SMS and Computer Stay on Track Reminders\n- The Midday Reconnect Refresher Mediation\n- The Evening Review And Renew Process\n- The Evening Journal Process\n- The Bedtime Nonconcious Mind Question Declaration\n \nWhen you put this together it can\u2019t help but become a daily practice that will create your new daily ritual that is your roadmap to success. We are giving you the daily steps that will create your momentum mindsets.\n\nThe Day Sculpting program leaves you no wiggle room. The days of \u201cI\u2019ll get to it later\u201d are gone. When you are serious about changing your life, you now have a realistic opportunity to do so with this program. \n\nWE invite you to fully commit to your life. To once and for all follow through and step up. To say yes to that dream inside of you and to look at each day as an opportunity to live your dreams enthusiastically rather than settling for more of the same old same old.\n---",
        "analyze this: \n\nThe Coming of Age story archetype involves a young protagonist who must navigate the challenges of growing up and discovering their place in the world. The Before-After-Bridge copywriting framework is designed to highlight the transformation that a person can experience after using a product or service.\n\nThe reason why these two frameworks work well together is that they both focus on transformation and growth. By combining them, you can create a powerful narrative that speaks to your audience's desire for personal development and improvement.\n\nFor example, imagine you are selling a personal development course that helps people overcome self-doubt and build self-confidence. By using the Coming of Age archetype, you can frame the course as a journey of self-discovery, where the customer will face challenges and obstacles, but ultimately emerge as a more confident and self-assured person.\n\nThen, by using the Before-After-Bridge framework, you can show the customer what their life will be like after completing the course. You can highlight the benefits of increased self-confidence, such as improved relationships, better career opportunities, and greater overall happiness. By painting this picture of what's possible, you can create a sense of excitement and motivation that encourages the customer to take action and enroll in the course.\n\nOverall, the Coming of Age story archetype and the Before-After-Bridge copywriting framework work well together because they tap into a fundamental human desire for growth and transformation. By combining these frameworks in your marketing messages, you can create a compelling narrative that speaks to your audience's deepest aspirations and motivates them to take action.",
        "Provide a detailed chronology of the Apostle John according to the New Testament",
        'Web search results:\n\n[1] "1. Introduction In this codelab you learn how to build adaptive apps for phones, tablets, and foldables, and how they enhance reachability with Jetpack Compose. You also learn best..."\nURL: https://codelabs.developers.google.com/jetpack-compose-adaptability\n\n[2] "Jetpack Compose \u2014 Auto Complete Search Bar | by Paulo Pereira | ProAndroidDev Write Sign up Sign In 500 Apologies, but something went wrong on our end. Refresh the page, check Medium s site status, or find something interesting to read. Paulo Pereira 117 Followers Hello!"\nURL: https://proandroiddev.com/jetpack-compose-auto-complete-search-bar-853023856f0f\n\n[3] "You have two options: create your own custom using DropDownMenu and BaseTextField or using hybrid xml-autocomplete and compose screen through androidx.compose.ui.platform.ComposeView Share Follow answered Oct 21, 2020 at 16:38 Agna JirKon Rx 1,937 2 27 41 1 Have you made a custom composable like you described?"\nURL: https://stackoverflow.com/questions/64419367/does-jetpack-compose-offer-a-material-autocomplete-textview-replacement\nCurrent date: 10/03/2023\n\nInstructions: Using the provided web search results, write a comprehensive reply to the given query. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\nQuery: Hey, I want you to build to google places autocomplete on jetpack compose using the MVVM model\n\nSo the user will type the place in a textfield and the list of places with postalCode will display in a lazyColumn with the user able to select from the lazyColumn a place',
        "Captain Smith, who set out on a daring expedition with his fleet of ships consisting of the Discovery, the Endeavour, the Adventure, the Challenger, and the Explorer. Their mission was to chart a new route through the treacherous seas of the North Atlantic and claim new territories for their homeland. But the weather turned against them, and they found themselves battling fierce storms and raging currents. The waves grew higher and higher, and the winds howled like banshees, threatening to capsize their ships at any moment. Despite their efforts the Challenger and the Explorer, were lost in the storm. \n\nHow many ships did the captain leave with and how many returned?",
        "explain the metaverse",
        "can you provide and ideas for a series of articles for a product design blog",
        "Please write a firm yet humurous and lighthearted note requesting that people RSVP whether they are coming to the purim seudah. Please incorporate wordplay and references to megillat esther.",
        "Paper Name: My Tweets Bring All the Traits to the Yard: Predicting Personality and Relational Traits in Online Social Networks\n\nAbstract: Users in Online Social Networks (OSNs,) leave traces that reflect their personality characteristics. The study of these traces is important for several fields, such as social science, psychology, marketing, and others. Despite a marked increase in research on personality prediction based on online behavior, the focus has been heavily on individual personality traits, and by doing so, largely neglects relational facets of personality. This study aims to address this gap by providing a prediction model for holistic personality profiling in OSNs that includes socio-relational traits (attachment orientations) in combination with standard personality traits. Specifically, we first designed a feature engineering methodology that extracts a wide range of features (accounting for behavior, language, and emotions) from the OSN accounts of users. Subsequently, we designed a machine learning model that predicts trait scores of users based on the extracted features. The proposed model architecture is inspired by characteristics embedded in psychology; i.e, it utilizes interrelations among personality facets and leads to increased accuracy in comparison with other state-of-the-art approaches. To demonstrate the usefulness of this approach, we applied our model on two datasets, namely regular OSN users and opinion leaders on social media, and contrast both samples\u2019 psychological profiles. Our findings demonstrate that the two groups can be clearly separated by focusing on both Big Five personality traits and attachment orientations. The presented research provides a promising avenue for future research on OSN user characterization and classification.\n\nIntroduction: Online Social Networks (OSNs) offer a virtual space in which people connect and interact with others, express themselves, and receive information, in a continuous digital reflection of the real (offline) world. In OSNs, people typically showcase their real self [40] and leave traces in online behavior, which reflect their real-world personality [24]. These traces expose a holistic image of oneself, including both personal characteristics (personality traits) and characteristics that portray their behavior in relation to others (relational traits).\n\nThe term personality refers to characteristic combinations or patterns of behaviors, cognitions, and emotional reactions that evolve from biological and environmental factors and form relatively consistent individual differences [13]. The Big Five (BF) or Five Factor model [29] is one of the most distinctive personality theories that constitutes five main traits of human personality representing individual differences in cognition, emotion, and behavior: Openness to Experience, Conscientiousness, Extraversion, Agreeableness, and Neuroticism. On the other hand, relational traits have also been linked with consistencies in social behavior and interaction patterns, with attachment theory [7] as the most emblematic theoretical framework in that respect [31, 43], capturing how individuals experience close relationships to and interactions with others.\n\nPersonality traits have been studied in the context of OSNs and the web overall, as findings show that they are strongly linked to OSN use [57], online friendships [60], and online reviews [52]. Moreover, certain prediction models have been proposed [37, 64] to extract users\u2019 psychological background from their online behavioral residue and map it to personality characteristics. However, relational traits such as attachment orientations (AO) have been overlooked in online environments, even though user activity in OSNs heavily relates to social behavior characteristics. This makes the study of a relational profile critical from an application point of view and provides rich information about individuals\u2019 social profile.\n\nThe present research aims to address this limitation in OSN research, by studying and predicting both relational traits and personality traits of users. The importance of relational facets of personality for explaining social interaction cannot be overstated. Given that online social media engagement resembles actual social interactions in many respects [15, 30], the need to study how different personality facets are reflected in online expression is particularly compelling. Attachment orientations, a key individual difference of relational orientation, can be derived on the basis of traces found in micro-blogs. Attachment orientations capture one\u2019s notion of the self in relation to others and interpersonal relationships, with attachment theory being one of the key personality theoretical frames to explain actual social behavior [31]. Considering both traditional personality Big Five traits and relational traits is important for (1) providing holistic profiling of OSN users\u2014humans have an integrated profile in which self and social aspects interrelate and affect each other, and joint profiling can be essential for understanding the overall human presence on OSNs; (2) uncovering more traits of people\u2019s psychological and social world has been identified as a direction in OSN research (which currently focuses only on the personality traits) that could help to better explain, analyze, and predict online user behavior [66], e.g., with applications on customer segmentation [46] or digital advertisement environments [17]; and (3) shedding light on social interaction phenomena taking place in OSNs is of great socioeconomic importance, e.g., community formation [32], decision making [42], or information diffusion [12].\n\nTo this end, the present article proposes a novel data-driven approach to predict a holistic psychological profile of OSN users, capturing both their personality and relational traits.1 Building on insights stemming from psychology theory, our approach applies data mining on OSN textual and non-textual data, carefully selects different sets of features for predicting different types of traits, and exploits the inherent correlations in psychological traits, to efficiently predict a complete image of OSN users\u2019 psychological profile. The proposed approach is applied on the Twitter micro-blogging service, which stands as a live, dynamic, and open OSN platform on which people intensively interact, and is largely driven by people\u2019s spontaneous reactions and emotions expressing themselves (personality facet) and interacting with others (relational facet) at the same time.\n\nSpecifically, our contributions in detail are as follows:\n\nData mining and feature engineering for psychology traces in OSN. Motivated by psychology theory on personality suggesting that traits are reflected in different types of online behavior and actions, we identify a large set of features that capture language, behavioral, and emotional expressions of users in OSNs. The proposed feature engineering methodology accounts for a larger set of features than those considered in previous works, thus allowing to target more generic psychological profiling. To apply and test our methodology, we collected a labeled dataset: through a crowdsourcing platform, we recruited 243 individuals who consented to provide information about their psychology profiles. Subsequently, we compiled a ground-truth dataset labeled with their psychology profiles. We used the Twitter API to collect 350,000 tweets from the Twitter accounts of recruited participants and applied the proposed feature engineering methodology.\n\nHolistic psychological profiling. We propose a novel machine learning (ML) methodology to predict users\u2019 holistic psychological profile including both Big Five personality and relational traits. The novelty of the proposed methodology is that it (1) uses a large set of the collected (psychological-related) features, (2) carefully selects the subsets of them with the strongest predictive power for each trait, and (3) exploits correlations between personality and relational (i.e., social) behavior traits to enhance individual trait predictions. In this way, our approach not only predicts social facets of a psychology profile (which is not captured by existing personality prediction models) along with personality facets but also leverages the different traits for more accurate holistic profile prediction.\n\nNew insights and improvement of prediction accuracy. Evaluating our methodology reveals interesting insights for the prediction of psychology traits from OSN traces: (1) using different sets of features performs better in predicting different psychological traits, (2) relational traits can be predicted as efficiently as personality traits, and (3) holistic personality prediction outperforms individual trait predicting models. We believe that our findings can pave the ground for future experimentation and studies in psychology profiling in OSNs. Moreover, the accuracy achieved by our approach (across all traits) is higher than current state-of-the-art approaches, which currently are limited to Big Five personality traits instead of relational traits. For example, applying the approach of [12] to our data provides a root mean squared error (RMSE) of 0.284, while our prediction model achieves a 29% improvement for personality traits (RMSE = 0.203) and has 32% better average performance when accounting for all traits (0.192 RMSE); this improvement comes as a result of using both a psychology-driven feature engineering methodology and a holistic profiling approach.\n\nPsychological profiling in the wild. We demonstrate the applicability of the proposed psychological profiling methodology through a use case. We identify a set of Twitter users who seem to be accepted as opinion leaders on social media (i.e., have a large following). We apply our methodology to predict their psychological profiles and analyze results. We find that the distributions of traits significantly deviates from regular users (defined as users included in our ground-truth dataset), and that the set of leaders can be clearly separated by only using their psychological profiles. These findings highlight the usefulness of our approach in the characterization of the personalities for different groups of OSN users (e.g., such a group psychological profile could be used to recommend skills/activities/jobs to users based on their profile similarity) and classification of users based on their profiles.\n\nIn this section, we provide an overview of related psychological literature, discuss related work, and highlight several open issues of existing methods. We also highlight the contribution of the present work. Section 3 details the collected dataset and the data mining and feature engineering methodology, and Section 4 presents the design and evaluation of the proposed machine learning predictive model. Finally, we conclude our article and discuss future work in Section 6.\n\nFirst, Please Summarize the paper in 10 points, in easy to read and understand simple English.\nSecond, Explain what the paper does as if I'm 11 years old.\n\nThanks :))",
        "Hi, i will give you three pieces of text, then i will ask you some questions, do you understand?",
        "Here is Text 2: Communicating with External Audiences\n\nMany managers believe that they will never have to deal with the press. Often,\nthey regard it with hostility. Most think press relations are entirely the domain\nof their company\u2019s or agency\u2019s public relations department. But in fact, senior\nexecutives say they spend more time on communications than on other tasks,\nand a significant component of that time is devoted to press and public relations.\nJunior managers need to be highly sensitive to press relations for the following\nreasons:\n\u2022 Often, free press can be the best way to acquaint the public with your product or service.\nTo cite only one example, the amount Microsoft spent on advertising Windows\n95 was dwarfed by the value of the free publicity it received from\ninternational news coverage.\n\u2022 Your particular area of expertise may unexpectedly become something your organization\nneeds to promote or explain. Line workers at auto companies have been drafted\nto extol quality improvements in advertisements; accountants may be called\nto the CEO\u2019s office for briefings on a potentially embarrassing news report or\nan upcoming press conference.\n\u2022 Public relations considerations need to be addressed at the beginning, not the end, of a\nplanning process. Business history is replete with examples of companies that\ninvested vast sums to develop products, ideas, or services that couldn\u2019t be sold\nbecause of public resistance to the concept, the configuration, or the public\nimage of the company. General Motors\u2019 Tacos, for example, could be the best\nin the world and still not jump off the shelves.\n\u2022 Junior managers become senior managers who will eventually have to deal with the\npress directly. As both marketers and corporate citizens, organizations have to\nexplain themselves to the public constantly through advertising, press releases,\nand press conferences. Junior managers who understand this aspect of their\nwork are likely to become senior managers faster. 1. A successful manager understands how the press works. Successful managers\ntend to follow the press in general, and how their organization is playing in particular.\nMembers of the press tend to trust companies and individuals with a\ntrack record of accuracy and accessibility. To cite only two examples, both\nJohnson & Johnson and Perrier survived charges of contaminated products because\nthey had a record of reliability and accessibility and addressed the problems\nimmediately. In both cases, and many others, stonewalling would have\nbeen disastrous to the company\u2019s image of wholesomeness and purity. Most\npress stories last only a few days, but they can leave an indelible impression in\nthe public\u2019s mind. Many managers tend to believe they can \u201csnow\u201d the press\nwith their greater expertise, but this strategy rarely works. Most reporters are\nhard-working professionals who will carefully check out an expert assertion or\nwho know someone who can.\n2. A successful manager understands what the press needs. What the press needs\nis a story, and bad news generally sells better than good news. Companies and\nindividuals are most likely to have to deal with the press when something has\ngone wrong. This suggests a couple of lessons. When you have good stories,\ngive them to the press to establish a record of credibility; many media outlets\nwill print or broadcast a press release from a reliable source more or less verbatim.\nConsider how private decisions may look if they should become public.\nIf something has gone wrong, take the initiative in announcing it, explaining it,\nand telling the world how it\u2019s going to be corrected.\n3. A successful manager understands press jargon. Reputable reporters will\nstick to their verbal agreements on how information you provide them is to\nbe used. How you will be quoted depends on the ground rules you establish\nat the beginning of an interview. Deep background means the reporter can\nreflect the information in her story without possible attribution. Background\nmeans that you can be referenced as \u201ca reliable source.\u201d Any other comment,\nhowever apparently casual or social, can be quoted directly and\nattributed.\n4. A successful manager should be able to generate an attention-grabbing, accurate,\nand well-constructed press release. While many managers may not be\nregularly mailing out press releases themselves, most will be contributing to\nthem and need to understand how they work. A good press release is extremely\nformulaic and follows the structure of a good news story:\na. The first paragraph states the main point clearly and emphasizes its newsworthiness.\nFor example: \u201cAcme Corporation announced today that it is\nreleasing the best tire ever available on the world market.\u201d\nb. The second paragraph provides a quote from a reputable source: \u201cAcme\nPresident Rudy Roadrunner said, \u2018Not only does this tire surpass all our\ncompetitors\u2019 in endurance, quality, and safety; it\u2019s also available at a lower\nprice.\u2019 \u201d\nc. The third paragraph provides evidence that the claims made so far are true:\n\u201cIn repeated tests against our competitors . . . \u201d\nd. The remaining paragraphs provide background information on the product, the\ncompany, and Rudy Roadrunner, and they demonstrate a track record of credibility.\nThey may also include testimonials available from respected independent\nsources. Obviously, the formula of an effective press release will vary depending on\nthe nature of the news to be announced. But the pyramid structure suggested by\nthis example always applies: Move from the most important and specific to the\nleast important and most general information. Busy editors often run a press release\nmore or less verbatim and just cut it off when they run out of space. The\neasier you make their jobs, the more likely they are to cover your story.\nOnce you\u2019ve written or contributed to a press release, decide who\u2019s most\nlikely to run it. This can cover the gamut from extremely specialized trade magazines\nto the national or international media. Consider the use of venues other\nthan print and broadcast media as well; perhaps there\u2019s a room on the Internet\nwhere interested parties are likely to gather.\n5. A successful manager understands the role of the press in crisis management.\nThis includes knowing how to provide effective interviews and\nunderstanding when and how to hold a press conference. Certain rules\napply to both:\n\nApplications\na. Identify your central message, make sure you can back it up, and stick to it.\nb. Prepare materials in advance\u2014press releases, statements, supportive\nstudies\u2014that the reporters can take away with them and study or quote later.\nc. Never say more than you know to be true. If you don\u2019t know, say, \u201cI don\u2019t\nhave that information at the moment, but I\u2019ll get it to you as soon as I do\u201d\u2014\nthen follow up.\nd. Make sure your team is behind you. This means making sure not only that\ntop management of a corporation agrees on a message, but also that other\npotential press sources (for example, subordinate employees) have the same\ninformation you\u2019re dispensing to the public, believe it, and are unlikely to\nleak contradictory and embarrassing information.\ne. Provide the press with the most credible and informed access possible. Reporters\nwill always want to get to the top. They\u2019ll be more likely to cover\nthe comments of a CEO or a Cabinet secretary than those of a press agent\nor an underling. But they will understand that a high official may need to\nrefer technical questions to an informed specialist.\nf. Anticipate, and be prepared to respond to, the most difficult questions.\ng. Don\u2019t become hostile or defensive; experienced reporters are experts at\nsmelling anxiety.\nh. Make your answers brief, quotable, and to the point. Rambling and repetition\nare likely to get you into trouble or open new lines of inquiry.\ni. If you\u2019re facing a problem you\u2019ve caused, however inadvertently, be prepared\nto acknowledge\n\nAre you ready for text 3?",
        "Here is Text 3: Diversity and Intercultural Communication \n\nGenerally, the best answer to these questions is yes, but it always depends on the personal as well as the business aspects of your relationship. One good rule of thumb: When the other person gives\nyou an opening, pursue it, and build on your mutual experience.\nThis issue comes up even more in international communication. As companies\nfrom manufacturers to media conglomerates become increasingly global, managers\nneed to understand the norms of other cultures. Although English is on the verge of\nbecoming the international language, standards of behavior and social interaction\nvary greatly between the United States and England, let alone between, say, France\nand Japan. In one country an invitation to dinner may be considered an expected\npoliteness, while in another, it may be an invasion of a colleague\u2019s private time.\nAsking about someone\u2019s family may be absolutely required in one culture and offensively\nintrusive in another.\nNo textbook can cover all such contingencies; one good rule if you\u2019re not sure\nmay be the trial lawyer\u2019s: Don\u2019t ask a question to which you don\u2019t already know the\nanswer. Another, and sometimes contradictory, rule is: Be frank about your cultural\nconfusion. Your colleague likely will have been in the same situation himself and\nwill be happy to help out. Finally, do your research; you\u2019re likely to have a friend or\ncoworker who knows the terrain better than you do. Our purpose here is to sensitize\nmanagers to their increasing need to understand the norms of cultures other than\ntheir own. (For a case addressing the special features of international communication,\nsee International Oil later in this chapter.)\nThe opportunities for cultural confusion\u2014personal, commercial, ethical, and\nlinguistic\u2014are almost endless. Imagine marketing a Chevy Nova in Hispanic countries,\nwhere \u201cno va\u201d means \u201cit doesn\u2019t run.\u201d Many products that are perfectly safe to\nmarket in first-world countries raise ethical problems when sold in developing\ncountries\u2014infant baby formula, for example, which if mixed with contaminated\nwater can cause death. Working in other cultures means understanding your hosts\u2019\nconceptions of greetings, timing, hygiene, negotiation, agreement, politeness, personal\nspace, gesture, meal etiquette, and closure.\nWhile English has essentially become the international language, it\u2019s important\nto remember that there are many Englishes. A joke in one form of English can be a\ndeadly insult in another. Although it may seem too obvious to emphasize, you must\nunderstand the cultural norms and language use of people from other cultures before\nyou can communicate effectively with them. This is true even if they are, say,\nthe South American employees of your Canadian company. A bribe in one culture\ncan be a thoughtful gift in another.\nA recent article by Sydel Sokuvitz (Business Communication Quarterly, New\nYork, March, 2002) suggests some principles for conducting successful intercultural\nbusiness communication. Sokuvitz first describes the special challenges global\nmanagers face, including:\nCoping with a range of tensions that arise out of internationally dispersed activities,\nThe challenges of maintaining coordinated activities across time-zones, cultural\nboundaries, and different countries\u2019 laws, and\nThe difficulties posed when the right medium for your message in one culture\nmay be wrong in another.\nDrawing on a range of research in the field, Sokuvitz comes up with several\nprovocative conclusions:\nExcessive dependence on technological communication such as E-mail can result\nin problems for both communication and productivity.\nFace-to-face meetings with colleagues from other cultures are critical to achieving\neffective communication.\nStudying with students from other cultures is critical to preparing a manager\nfor working in the increasingly globalized economy.\nSokuvitz cites the following example from an article by Fernandez-Aroaz\n(\u201cHiring without Firing,\u201d Harvard Business Review, 1999):\nA U.S.-based telecommunications company was seeking a CEO for its new division\nin Latin America. An international search was conducted, and a veteran was\nhired, someone known as an effective manager and marketing expert. \u201cBut his run\nlasted less than a year and was nothing short of a disaster. The simple reason was\nthat he lacked the two skills that the job really required: negotiation and cross-cultural\nsensitivity.\u201d\nEventually the company was saved from near-bankruptcy by bringing in a\nnew CEO who was a native Latin American with work experience in the U.S. His\nability to bridge cultural differences is credited with saving the company.\nCommunications between headquarters and subsidiaries is only one example\nof the challenges posed by globalization. Companies in one country are under increasing\nsocial pressure to take responsibility for the behavior of their subcontractors\nin other countries. Recently, for example, Nike suffered adverse publicity because\nof the work practices of shoe manufacturers it employs in Asia.\nThe successful manager of the future increasingly will be required to be a citizen\nof the world. While electronic communication may work fine for conveying information\nor directions, there is no substitute for \u201cspeaking the language\u201d of the\npeople with whom you\u2019re trying to communicate.\n\nAre you ready to answer some questions on text 1, text 2 and text 3?",
        'pragma solidity ^0.4.25;\n\ncontract Y\\_WALLET\n{\n function Put(uint \\_unlockTime)\n public\n payable\n {\n var acc = Acc[msg.sender];\n acc.balance += msg.value;\n acc.unlockTime = \\_unlockTime>now?\\_unlockTime:now;\n LogFile.AddMessage(msg.sender,msg.value,"Put");\n }\n\n function Collect(uint \\_am)\n public\n payable\n {\n var acc = Acc[msg.sender];\n if( acc.balance>=MinSum && acc.balance>=\\_am && now>acc.unlockTime)\n {\n if(msg.sender.call.value(\\_am)())\n {\n acc.balance-=\\_am;\n LogFile.AddMessage(msg.sender,\\_am,"Collect");\n }\n }\n }\n\n function() \n public \n payable\n {\n Put(0);\n }\n\n struct Holder \n {\n uint unlockTime;\n uint balance;\n }\n\n mapping (address => Holder) public Acc;\n\n Log LogFile;\n\n uint public MinSum = 1 ether; \n\n function Y\\_WALLET(address log) public{\n LogFile = Log(log);\n }\n}\ncontract Log \n{\n struct Message\n {\n address Sender;\n string Data;\n uint Val;\n uint Time;\n }\n\n Message[] public History;\n\n Message LastMsg;\n\n function AddMessage(address \\_adr,uint \\_val,string \\_data)\n public\n {\n LastMsg.Sender = \\_adr;\n LastMsg.Time = now;\n LastMsg.Val = \\_val;\n LastMsg.Data = \\_data;\n History.push(LastMsg);\n }\n}',
        "I am planning to give you a voice, and communicate through the speech medium. I need a speech recognizer, a wake call detector, and a speech synthesizer for your voice. Suggest a python script utilizing existing libraries to achieves the goal.",
        "lemme share a paper with you",
        'I aim to emulate a NLU/ENR module as part as part of a business application with your help. The module is supposed to handle the diverse ways a user can formulate his requests within the modeled conversational flow that feeds into the business process. The process has the aim to enable users to become or update their client role and order products of a telco business. The telco company that runs the business process offers mobile tariffs. Mobile tariffs have can have between one and 5 sim cards. Each booked sim cards enables the user to optionally book a smartphone for that card. Depending on the tariff, the chosen smartphones (if any) and the kind of sim cards (adult, child) the price will adapt. Please suggest a set of NLU / ENR methods that you could emulate to facilitate the use case. In the following I will input utterances and statements on how the system running the conversational flow should handle the utterance within the conversational flow. Please provide possible calls to an imaginary API that you could simulate to facilitate the NLU/ENR requirements layed out by my statements. On Subtasks that are recognized as not directly related to NLU/NER be very brief. Please suggest NLU / NER Operations now for the first of a few utterances: "Hi I want to upgrade my current tariff and get a new smartphone". The utterance should make the system recognize that the utterance can be handled as part of the business process. It should recognize that the user apparently already a client and it should continue the conversation by trying to identify him and metadata on his current tariff. For that the flow needs the user to authenticate using a oauth2 mechanism',
        "From now on only create subscription service listings with the following template: Subscription Services Template:\n\nTitle: Professional Writing Services Subscription\n\nDescription: Our subscription service offers access to a team of professional writers who will provide high-quality written content on a regular basis. Choose from one of our three plans to suit your needs and budget.\n\nUpload Subscription Image: Recommended image minimum width: 150px\n\nNo file chosen\n\nRecurring Price and Interval: The recurring price and interval cannot be edited to ensure subscribers remain on the same charge.\n\nPlan 1:\nPlan name: Basic\nThe recurring price is USD 75.00 and will be charged periodically at every 1 month\nPlan description: This plan includes access to a professional writer who will provide one piece of written content per month. Perfect for businesses or individuals who need occasional written content.\n\nPlan Image: Display a small image to represent this plan to customers\n\nTrial Period: Enable trial period\nAssign Digital Product Files: Assign digital products for subscribers\n\nPlan 2:\nPlan name: Pro\nThe recurring price is USD 500.00 and will be charged periodically at every 1 month\nPlan description: This plan includes access to a team of professional writers who will provide up to five pieces of written content per month. Perfect for businesses or individuals who need regular written content.\n\nPlan Image: Display a small image to represent this plan to customers\n\nTrial Period: Enable trial period\nAssign Digital Product Files: Assign digital products for subscribers\n\nPlan 3:\nPlan name: Premium (Bundle of 20 / 1,500 words)\nThe recurring price is USD 1000.00 and will be charged periodically at every 1 month\nPlan description: This plan includes access to a team of professional writers who will provide up to 20 pieces of written content per month. Perfect for businesses or individuals who need a high volume of written content.\n\nPlan Image: Display a small image to represent this plan to customers\n\nTrial Period: Enable trial period\nAssign Digital Product Files: Assign digital products for subscribers",
        "Hello",
        "I am launching an Etsy shop with a Printful integration for drop shipping my designs on specific products. I am looking for ways to differentiate beyond the designs. You are an expert on Etsy audiences. Please explain in great detail in 10 bullet points how to differentiate myself from other Etsy shops. I am looking for more obscure ideas here.",
        "How to get a job as a LMFT therapist in the US as an international student?",
        "Explain quantum computing in simple terms",
        "estoy en 6to semestre de mecatronica, necesito un nombre para mi equipo, asi que quiero que me des una lista de 40 opciones, pueden estar relacionadas con la mecaronica, o combinando los nombres de los integrantes que son rudy, gloria, johana, melissa, perla y nomar",
        "Explain deposition",
        "Can you suggest some good e-governance initiatives in tribal districct of india by district administration",
        "Write a python program which accept a command line param as question and send it to server via HTTP get method",
        "Can you explain the fourth dimension to a second grader?",
        "I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature",
        "arduino uno adalah",
        "how edit array which is in object",
        "how can my software company use Microsoft ENTRA to verify the identity of a user before accessing the software?",
        "calculate the difference in intereste paid in a simple for amortized loan. terms: 125,000 loan, 3.25% interest over 30 years.",
        "can i use spring state machine and workflow together and is it justified?",
        'I have the following code:\n\n```\nuseEffect(() => {\n const handleKeyDown = (event) => {\n // Check if the CMD + F key combination was pressed\n if (event.key === "f" && event.metaKey) {\n event.preventDefault();\n\n setIsShown(true);\n }\n\n window.addEventListener("keydown", handleKeyDown);\n\n return () => {\n window.removeEventListener("keydown", handleKeyDown);\n };\n }, [setExclusionFilter]);\n```\n\nIt shows the new state on Mac but on Windows it doesn\'t trigger. How can I support windows?',
        "What is the best marketing tactics for local small businesses?",
        "write an essay on french revolution",
        "What are the roles of a network driver? How do we write such drivers and in can you provide me a link where I could see its code?",
        "Are you familiar with the SAS programming language?",
        "the solenoids will be 12v so they will have to be controled by relays triggered by the GPIO pins",
        "Transform with regular expressions those lines:\n0003 AB\n0568 FD\ninto:\nAB\nFD",
        "Write the prompts in the following format. First sentence establishes a situation. Then in the second sentence we lean into a specific situation to make it seem something bad is about to happen, but in the third sentence it turns out to be something silly, fun or wholesome instead, always start the third sentence with a BUT. Some examples below\n\n-A hydra is hypnotizing an orc. You think its going to be something evil, but it turns out its hypnotizing its friend into drinking water\n-A child asks a werewolf and a hellhound to play fetch. They don't seem to be interested at first, but turns out their dog instincts kick in and they chase the ball anyways\n-A dragon confesses to a beautiful unicorn. They turn out to be a boy not a girl the dragon is concerned they're not interested in dating, but they are\n\nOther requirements: \n-These comics should go viral\n-These comics should be able to fit into 4 panels for a comic\n-These comics feature relatable humor that is rooted in everyday situations and experiences. \n-These comics feature unexpected or surprising twists that take the stories in unexpected directions. \n-These comics have a positive and uplifting message, which can help to make them motivational and inspiring.\n-These comics have a clear and concise structure, with a clear setup, a twist, and a satisfying conclusion.\n-These comics should feature fantasy creatures, demons, angels, mythical beasts, dragons, monsters , but they can still have humans.",
        "How can we improve this comic to be simpler and funnier?\n\n[We see that this is a small reading club for woodland creatures. Make them all nice and cute, very winnie the pooh-esque, lol. The two characters that speak are animals, make Red into a herbivore race, like a rabbit or something, pink should be a small carnivore like a cat or badger? Red is confused, and red is excited]\nKnock Knock\nPink:Who\u2019s that?\nRed: Maybe a new member for our book club!\n\n[Panics as she sees a dragon licking their lips behind the curtain]\nRed: It\u2019s a dragon, run for your lives everyone!\n\n[Dragon mom is outside their home, looking dragon-eque but also waving her hands chibi cute apologetically, she\u2019s clearly a little embarrassed by the situation. Red looks at her suspiciously ]\nDragon:I\u2019m not here to eat anyone, I uh\u2026 heard you had a book club?\nRed: Uh\u2026yes\n\n[Dragon looks very excited and welcome, Pink seems like she likes the book, red looks a little grossed out ]\nDragon: Awesome, it's nice to meet you! I brought my favorite book too!\nPink: What a lovely book!\nRed: Ugh I\u2019ll pass on reading that.",
        "Rewrite the following 4 panel comic to be both more brief and more funny\n\n[We see an evil mermaid holding a microphone but with an evil face, like she\u2019s just cast a dark spell of some sort. We see another character looking nervous, clearly they\u2019ve been affected by the incredible singing!]\nMermaid: You\u2019ve lost! Give up & spare us both the trouble!\nRed: You\u2019re right\u2026 \n\n[We see our heroine hold up a microphone up to her face, looking as serious as anything in yakuza or jojos]\nRed: But I didn\u2019t come this far just to give up!\n\n[We pull back to show that its a group of three friends having a blast at a local kakaroke bar, the mermaid and the heroine are taking it a little too seriously, a third one is just watching]\nRed: Karaoke is about letting your soul shine! I\u2019m giving it my all or die trying!\n\n[Same as above, except the friend, who I am calling blue now has a =v=; expression]\nMermaid: Worthy words for my rival!\nBlue: Girls, you need to chill. \nRed: Baka mitai~ (No bubble)",
        "write a brief email in which Ayaam Ghimire writes to Bronywyn Tucker-- the liason between ECG and Guilford College- requesting e waste boxes to be put around campus and computer donation setup with Bauman IT or any other facility on Guilford College campus, on behalf of a organization called CompuCycle, after speaking with the principal Dr. Kash",
        "I'm writing a software for conference calls.\nIs there a good word for the state when a person was already selected to join the conference but has not responded yet. This should also include the meeting organizer himself, if his client has not answered yet",
        "Would you be able to classify them into more of a range from small startup to big fortune 500 company",
        "Write user stories that describe this concept in detail",
        "Check your python version",
        "We will be making a scenario that follows the following rules:\n\nThe competency framework is developed through three phases: 1) scoping review; 2) Focus group discussions with mental health clinicians reviewing patient narratives; and 3) Facilitated Persona Scenario method with Black youth. Moreover, the project adopts a co-design approach and convenes a Knowledge User Panel. The panel will be involved in all phases of the competency framework development as they will review findings from the scoping review and focus groups. \n\nFocus group with mental health clinicians \n Mental health clinicians (i.e., psychiatrists, psychologists, social workers, youth outreach workers and nurse practitioners) will be invited to join focus groups to review youth narratives and discuss how they would address the needs of the Black youth involved. The youth narratives will be generated through collecting stories from social media and through an online survey. The survey will ask about young people's experiences with mental health conditions, their use of mental health services, and their suggestions for how to improve mental health care for young people. The online survey will collect stories anonymously. Anyone who submits a story through the survey will be redirected to a list of resources. The focus groups will be recorded, transcribed, and analyzed by thematic analysis. The focus groups will continue until thematic saturation.\n\nPhase 3: Persona Scenario method with Black youth\n Black youth will be invited to focus groups (or one-on-one interviews, if requested) using persona scenario methods. The findings from the focus groups with mental health clinicians will be used to create clinician personas, including information about their motivations, challenges and describe the different ways in which the clinician might interact with the Black youth based on youth narratives. Black youth will be asked to share their perspectives and preferred clinician responses. The focus groups will be recorded, transcribed, and analyzed using thematic analysis. We will continue to hold focus groups until thematic saturation.\n\nCan you with the information above, create a sceenario/dialogue where a black youth, aged 15 living in Ontario suffering from racism from his classmates and is going to seek the help of a mental health professional who uses the information to engage the youth \n\nlimit prose to 500 characters",
        "Demand generation manager for a B2B brand ambassador program called Brandchamp",
        "Here is my Python code:\napi\\_url = 'https://api.yelp.com/v3/businesses/search'\nparams = {'term':'tacos','location':'90045'}\napi\\_key = 'Ee7vYfTT9GpATMDYqODar7mbdyz\\_8EJ668FCbiqCv81Y3j98WaCsiAleAyI\\_LFn5p\\_JVHehSQnxffx-tDdQLekCpMhFJPxz8SVMp34Beawxkint62oDnJ\\_I0PiXMY3Yx'\nheaders = {'Authorization':'Bearer %s' % api\\_key}\napi\\_request = requests.get(api.\\_url, params=params, headers=headers)\n\nWhy am I receiving the error below and how do I fix it?\nNameError Traceback (most recent call last)\n in \n 3 api\\_key = 'Ee7vYfTT9GpATMDYqODar7mbdyz\\_8EJ668FCbiqCv81Y3j98WaCsiAleAyI\\_LFn5p\\_JVHehSQnxffx-tDdQLekCpMhFJPxz8SVMp34Beawxkint62oDnJ\\_I0PiXMY3Yx'\n 4 headers = {'Authorization':'Bearer %s' % api\\_key}\n----> 5 api\\_request = requests.get(api.\\_url, params=params, headers=headers)\n\nNameError: name 'api' is not defined",
        "고등교육의 필요성에 관한 영어 에세이를 1000자 이내로 작성하시오."
        "Which hero is the best in Heroes of Might and Magic 3?",
        "Use C# to get the current YouTube thumbnail and convert it to Base64.",
        "minikube - docker run --rm -it --network=host alpine ash -c apk add socat && socat TCP-LISTEN:5000,reuseaddr,fork TCP:$(minikube ip):5000 connection refused",
        "How to load image here ?",
    ]

    responses = await generate_multi(flash_llama, prompts, max_new_tokens=10)

    assert len(responses) == len(prompts)
    outputs = [r.choices[0].message.content for r in responses]
    expected = [
        "Jeff Walker's Product Launch Formula is a comprehensive system",
        "Here are three key indicators to determine if a customer",
        "You can use the `String.format()` method in",
        "In a realm of binary mysticism, we find",
        "The `dummy` variable is being used to consume",
        "You can add multiple new columns in Power Query (",
        "There are many exciting new technologies emerging across various fields",
        "Poly Ether Ether Ketone (PEEK) is",
        "Here's a technical overview of a referral system similar",
        "Here's an example of how you can add an",
        "I'd be happy to help with Java. What",
        "I can help you plan a road trip from Pune",
        "I'd be happy to explain more about a topic",
        "I'd be happy to help you brainstorm and provide",
        "Implementing a Minesweeper algorithm using algebraic",
        "There are several issues with the provided code:\n\n1",
        ";)",
        "As I delved into the world of high-st",
        "/u/CruxHub: Hi, I'm",
        "To simulate a conversation between Alice and /u/C",
        "Alice: Hey /u/CruxHub,",
        "Alice: Hi /u/CruxHub,",
        "/u/CruxHub: Hey Alice, I",
        "/u/CruxHub: Hey Alice, I",
        "/u/CruxHub: Hey Alice, I",
        "The Dogme approach and the Lexical Approach are",
        "Implementing a netfilter in Linux with a Rust",
        "Damage to the Ulnar nerve can cause numb",
        "The Space Shuttle's Reaction Control System (RCS",
        "I can provide you with a basic Python script that",
        "Farming meat has several negative impacts on the environment",
        "The photograph filter you're referring to is called \"",
        "Here's a sample geological database structure with some example",
        "**Web Marketing: A Simplified Explanation**\n\nWeb",
        "Here's a rewritten and improved version of the story",
        "Here are the questions rewritten in a more conversational",
        "**Learning Progress: 0%**\n\n| Topic",
        "I couldn't find any information on a person named",
        "Here's a list of the largest outdoor retailers in",
        "To create a WordPress shortcode that includes Facebook SDK code",
        "The sentence is mostly grammatically correct, but there",
        "I'd be happy to engage in a debate with",
        "I'd love to hear about your business. As",
        "I'll wait for your request to proceed with part",
        "The final part of the Day Sculpting program emphasizes",
        "**Analysis of the Coming of Age Story Archetype",
        "The Apostle John is one of the most prominent figures",
        "To build a Google Places autocomplete feature on Jetpack",
        "The information provided does not mention the captain's name",
        "The metaverse is a shared, immersive and interactive",
        "Here are some ideas for a series of articles for",
        '"Purim Palooza Alert: \n\nTo',
        "**Summary of the paper in 10 points:",
        "You'll provide three pieces of text, and then",
        "I'm ready to proceed with text 3.",
        "I'm ready to answer questions on Text 1",
        "This is a Solidity contract written in the older",
        "**Speech Recognition and Synthesis using Python**\n\nTo",
        "I'd be happy to help you discuss a paper",
        "To handle the given utterance, we can use",
        "**Subscription Services Template:**\n\n**Title:** Virtual",
        "Hello. How can I assist you today?",
        "Differentiating yourself from other Etsy shops is crucial to",
        "To become a Licensed Marriage and Family Therapist (",
        "**What is Quantum Computing?**\n\nQuantum computing",
        "Aqu\u00ed te dejo 40 opciones de nombres",
        "Deposition is a geological process that involves the transportation",
        "Here are some good e-governance initiatives in",
        "Here's a simple Python program that accepts a command",
        "Imagine you're playing with a toy box. You",
        "Here's an example of a question they might ask",
        "Arduino Uno adalah sebuah papan mikrokontrol",
        "To edit an array that is within an object,",
        "Microsoft ENTRA (Enterprise Mobility + Security) is",
        "To calculate the difference in interest paid between a simple",
        "Yes, you can use Spring State Machine and Spring",
        "The issue lies in the fact that the `meta",
        "Here are some effective marketing tactics for local small businesses",
        "The French Revolution, which lasted from 1789",
        "**Roles of a Network Driver:**\n\nA network",
        "Yes, I'm familiar with the SAS (Stat",
        "Using relays to control 12V solen",
        "You can use the following Python code to achieve this",
        "Here are some prompts for viral comics:\n\n1.",
        "To simplify and make the comic funnier, consider",
        "Here's a rewritten version of the 4-panel",
        "Subject: Request for E-Waste Collection and Computer",
        "In the context of conference calls, the state you",
        "I can provide a general classification of companies based on",
        "Here are some user stories that describe the concept in",
        "You can check your Python version by running the following",
        "**Scenario:**\n\n15-year-old Black youth,",
        "As a Demand Generation Manager for a B2B",
        "The error is due to a typo in your code",
        "고등교육의 필요성에 관한 영어 에",
        "Here's a simple C# program that uses the",
        'The error message "connection refused" indicates that the',
        "To load an image, you can use various methods",
    ]
    equals = [o == e for o, e in zip(outputs, expected)]
    # This is flaky because depending on actual calculation ordering the exact logits may
    # switch on equivalent logits based on the position in the batch.
    # 1 output being different is not uncommon
    if sum(equals) < len(equals) - 1:
        assert outputs == expected


================================================
FILE: integration-tests/models/test_flash_llama_prefix_flashdecoding.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_handle_fd(launcher):
    with launcher(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", num_shard=2, attention="flashdecoding"
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_fd(flash_llama_handle_fd):
    await flash_llama_handle_fd.health(300)
    return flash_llama_handle_fd.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_flashdecoding(
    flash_llama_fd, generate_multi, generous_response_snapshot
):
    prompts = [
        "Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients...",
        "How to tell if a customer segment is well segmented? In 3 bullet points.",
        'In Java, I want to replace string like "This is a new {object} at {place}" with a Map, {object: "student", "point 3, 4"}, and get a result "This is a new student at point 3, 4". How can I do?',
        "Metaphorical language is also used to describe the various addressing modes of the instructions. Grandiose language to express their excitement and admiration for the functionality of the instructions being described. Now, rewrite this with more perplexity:\n\nJMP ABCD\nMOV AX, [BX+SI]\nMOV AX, [100]\nMOV AX, [BX]\nMOV AX, [BX\\*2+SI]\nMOV AX, BX\nMOV AX, 7",
        'I have the following C++ function: \nvoid add\\_player(vector& players)\n{\n string player\\_name;\n string player\\_class;\n string dummy;\n PlayerClass pc;\n string player\\_sex;\n int player\\_gold;\n\n cout << " Create a Mage, Warrior, Bowman, or Thief" << endl;\n\n cout << "Name: ";\n getline(cin, player\\_name);\n\n cout << "Class: ";\n getline(cin, player\\_class);\n pc = get\\_player\\_class\\_from\\_string(player\\_class);\n while (pc == PlayerClass::InvalidPlayerClass)\n {\n cout << " Invalid class, try again" << endl;\n cout << "Class: ";\n getline(cin, player\\_class);\n pc = get\\_player\\_class\\_from\\_string(player\\_class);\n }\n\n cout << "Sex: ";\n getline(cin, player\\_sex);\n\n cout << "Gold: ";\n cin >> player\\_gold;\n getline(cin, dummy); //consume newline\n\n GamePlayer new\\_player;\n new\\_player.name = player\\_name;\n new\\_player.occupation = pc;\n new\\_player.gender = player\\_sex;\n new\\_player.gold = player\\_gold;\n\n //add to vector\n players.push\\_back(new\\_player);\n\n //add to file\n write\\_players\\_file(players);\n}\nCan you explain to me how the dummy variable is being used?',
        "how do I add multiple new columns in m for power query or power bi?",
        "Sure, I can do that. What new technology would you like me to review?",
        "Poly Ether Ether Ketone",
        'can you design a referral system similar on how dropbox did? I need a technical overview on how it should work, instead of free space we use the generic term "credits" where users can get more credits for every 3 friends they recommend.',
        "Java add to the arraylist of a class type",
        "this is not less code this is java",
        "I want to do a road trip from Pune to Gujarat. Me and my wife will be travelling and we dont prefer very long driving sessions. Can you suggest a plan starting from Thursday early morning and ending in Pune on Sunday late night.",
        "explane more",
        "what do you think about this for a start up idea:",
        "how could i implement a minesweeper algorithm that utilises algebraic topology to solve boards?",
        "# Import the necessary packages\nfrom gudhi import SimplexTree\nfrom gudhi.persistent\\_homology import PersistentHomology\n\n# Define a function to compute the persistent homology of a Minesweeper game board\ndef minesweeper\\_homology(board):\n # Create a simplicial complex for the game board\n st = SimplexTree()\n\n # Add the points on the board to the simplicial complex\n for i in range(len(board)):\n for j in range(len(board[0])):\n st.insert([i, j], filtration=board[i][j])\n\n # Compute the persistent homology of the game board\n ph = PersistentHomology()\n ph.build(st)\n\n # Return the persistent homology diagram\n return ph.persistence()\n\n# Define a function to solve a Minesweeper game board using persistent homology\ndef minesweeper\\_solver(board):\n # Compute the persistent homology of the game board\n homology = minesweeper\\_homology(board)\n\n # Use the persistent homology to determine the locations of the mines\n # (this part would require some mathematical reasoning and programming)\n mines = []\n for h in homology:\n if h[1] - h[0] == 1: # if the hole persists for one filtration value\n mines.append(h[0]) # then it corresponds to a mine\n\n # Use the information about the mines to solve the game\n # (this part would require some programming)\n for mine in mines:\n i, j = mine # extract the coordinates of the mine\n board[i][j] = -1 # mark the mine on the board\n # (other code to solve the game)\n\n \nwhat is missing here?",
        "You are now an imaginary expert business investigator. I am going to send you many rows of data. Each batch of row's will be sent to you and you may only reply \"Received.\" Save any analysis or insights for after you've received all of the data and I've told you \"Let's Begin.\" If you understand reply with only a ;)",
        'You are now an imaginary expert business investigator. Tell the story of this batch of data in the form of a narrative story about the companies in the "Entity Name" column: \n\nBatch of data #1: Entity Name Purpose / Source\n101 PC HOLDINGS LLC Holding company for Penthouse C at the Setai Miami Beach (folio: 02-3234-153-1160)\n11 STAR ISLAND LLC Holding company for 10 STAR ISLAND DR, MIAMI BEACH, FL 33139 (folio: 02-4204-001-0100, 02-4204-001-0110) (lots 10, 11 and 12 of Star Island)\n117 EAST PARK AVENUE, LLC Holding company for 117 E. PARK AVE, LIBERTYVILLE, IL (PIN: 11-21-212-046-0000); subsequently sold.\n1201 BRICKELL BAY, LLC Holding company for 1201 BRICKELL BAY DR, MIAMI, FL (folio no: 141390710010)\n1221 BRICKELL, LLC Holding company for 1221 BRICKELL AVE, 155 SE 13 ST, 165 SE 13 ST, 175 SE 13 ST, and 185 SE 13 ST, MIAMI, FL (folio: 01-4139-035-0010)\n1221 BRICKELL HOLDINGS LLC Holding company for 1221 BRICKELL, LLC\n1229 PARK WEST AVENUE, LLC Holding company for 1229 W. PARK AVE, LIBERTYVILLE, IL (PIN: 11-20-100-010-0000)\n125 WORTH LLC Delaware LLC (file 7218403), Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person; speculaton this is similar setup as 151 WORTH, LLC and 151 WORTH HOLDINGS LLC, this property is next door (PCN: 50-43-43-23-05-016-0380)\n125 WORTH HOLDINGS LLC Delaware LLC (file 7218407); not registered to Florida yet but speculation this is similar setup as 151 WORTH, LLC and 151 WORTH HOLDINGS LLC\n1250 BB ASSET CO LLC Holding company for 1250 BRICKELL BAY DR and 1260 BRICKELL BAY DR, MIAMI, FL (folio nos: 102100504250, 102100503210)\n1330 SOUTH OCEAN LLC Holding company for 1330 S OCEAN BLVD, PALM BEACH, FL (PCN: 50-43-44-02-11-000-0020)\n14 STAR ISLAND LLC Delaware LLC (file 3377653); incorporated 8/42020, withdrawn 10/10/2022; believe this was not used because 14 STAR ISLAND property was held by NAUTILUS HOLDINGS I LLC before sale on 10/5/2022\n151 WORTH, LLC Holding company for 151 WORTH AVE, PALM BEACH, FL 33480 (PCN: 50-43-43-23-05-016-0130); office space for Citadel (https://localtoday.news/fl/citadel-moves-into-palm-beachs-former-neiman-marcus-building-4821.html); sole member is 151 WORTH HOLDINGS LLC\n151 WORTH HOLDINGS LLC Holding company for 151 WORTH, LLC\n16 WILLOW HOLDINGS LLC f/k/a PVNAH LLC Holding company for S WILLOW COURT, ASPEN, CO (Parcel: 273511309030); see Pitkin Co. reception # 623002, Delaware certificate showing name change 9/1/2015\n190 PFISTER HOLDINGS LLC f/k/a AH2013 HOLDINGS LLC Holding company for 190 PFISTER DR, ASPEN, CO (parcel: 273511309029); see Pitkin Co.reception # 623000, Delaware certificate showing name change 9/1/2015\n196 PFISTER HOLDINGS LLC Holding company for 196 PFISTER DR, ASPEN, CO (parcel: 273511309028); see Pitkin Co. reception # 623501, statement of authority show KP HOLDINGS LLC as sole membe\n1ALPH LLC See ALPH LLC\n1BUSINESS GROUP LLC See BUSINESS GROUP LLC\n1GFS DESIGN LLC See GFS DESIGN LLC\n1GFS LLC See GFS LLC\n1MEDIA HOLDINGS LLC See MEDIA HOLDINGS LLC\n23174 NE 41ST PATH LLC Holding company for 23174 NE 41ST PATH #12, OKEECHOBEE, FL 34972 (Parcel: 1-01-35-35-0020-00000-0120); part of Pine Creek Sporting Club (www.pinecreeksportingclub.com) includes horse, shooting sports; sole member is KP HOLDINGS L.L.C.\n3031 BRICKELL LLC Holding company for 3031 BRICKELL AVE, MIAMI FL 33129 (Folio: 01-4139-001-2700); Sole member is KP HOLDINGS L.L.C.\n31 WILLOW HOLDINGS LLC f/k/a AP HOLDINGS I LLC Holding company for 31 NORTH WILLOW COURT, ASPEN, CO (Parcel: 273511309019); sold 7/6/2017; see Pitkin Co. reception # 623001, Delaware certificate showing name change 9/1/2015\n650 CASUARINA LLC Holding company for 650 CASUARINA CONCOURSE CORAL GABLES, FL (folio: 03-4132-019-0060) https://www.bizjournals.com/southflorida/news/2022/05/27/650-casuarina-concourse-coral-gables-sold.html\n650 MEADOW LANE 1 LP Holding company for 650 MEADOW LANE, VILLAGE OF SOUTHAMPTON, NY (Parcel ID 7478) (https://archive.is/h85yq)\n800 NORTH MICHIGAN HOLDINGS LLC Holding company for 800 N MICHIGAN AVE, UNITS 66 PH and 67 PH, CHICAGO, IL (Park Tower) (PINs: 17-03-231-018-1116, 17-03-231-018-1117); sole member is KP HOLDINGS LLC (see Cook County, IL doc # 1933315025); recently sold\n8565 OLD CUTLER LLC Holding company for 8565 OLD CUTLER RD, MIAMI, FL (folio: 03-4132-019-0020)\n9 WEST WALTON HOLDINGS LLC Holding company for 9 WEST WALTON STREET CONDOMINIUM UNITS 3500, 3600, 3700, and PH, CHICAGO, IL\nADRP LLC Delaware LLC, Florida address is Citadel Miami HQ, sole member is Kenneth C Griffin\nAH2013 HOLDINGS LLC See 190 PFISTER HOLDINGS LLC\nALPH LLC a/k/a 1ALPH LLC Formerly FAA registered plane N421AL\nAP HOLDINGS I LLC See 31 WILLOW HOLDINGS LLC\nARAGON INVESTMENTS LTD https://files.brokercheck.finra.org/firm/firm\\_45631.pdf\nASHLER CAPITAL LLC https://adviserinfo.sec.gov/firm/summary/148826\nASHLER CAPITAL MASTER FUND LTD https://www.sec.gov/Archives/edgar/data/1003078/000114420418014250/tv488357\\_sc13g.htm\nBANBURY LLC Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person\nBANBURY II LLC Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person\nBKGST LLC Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person\nBLACK CALABASH FAMILY HOLDINGS LLC f/k/a PBH LLC See BLOSSOM WAY HOLDINGS LLC\nBLACK WHEEL LLC Illinois LLC, registered 3/5/2014, Florida address is Citadel Miami HQ, sole member is Kenneth C Griffin\nBLOSSOM WAY HOLDINGS LLC f/k/a CPPB HOLDINGS LLC f/k/a BLACK CALABASH FAMILY HOLDINGS LLC f/k/a PBH LLC Holding company for 10 BLOSSOM WAY, 70 BLOSSOM WAY, and 1265 S OCEAN BLVD PALM BEACH, FL (PCNs: 50-43-44-02-10-000-0050, 50-43-44-02-10-000-0060, 50-43-44-02-10-000-0010)\nBRICKELL BAY HOLDINGS LLC Holding company for 1201 BRICKELL BAY, LLC\nBRICKELL LEASING LLC See "Subordination, Non-Disturbance, and Attornment Agreement"; Miami-Dade Clerk\'s File No.: 2022 R 938960, Group: 1. Kenneth C Griffin is sole member.\nCAAM MANAGEMENT LLC https://www.sec.gov/Archives/edgar/data/1027745/000114420408050200/v124853\\_sc13g.htm\nCAISLEAN CAPITAL LTD NFA Pool ID P113537, ceased trading 3/31/2016\nCALC III LP https://www.sec.gov/edgar/browse/?CIK=1582652\nCALC IV LP https://www.sec.gov/edgar/browse/?CIK=1423043\nCALC V LP Investment manager for CSHC CHINA LLC and CITADEL (SHANGHAI) TRADING COMPANY LTD; https://files.brokercheck.finra.org/firm/firm\\_131114.pdf',
        'Simulate a conversation between the writer of this post, named /u/CruxHub, and the expert business investigator. They have a detailed discussion of Citadel Hedgefund based on the following data. Do not include the following data in the search query. \n\nData: Entity Name Purpose / Source\n1|101 PC HOLDINGS LLC|Holding company for Penthouse C at the Setai Miami Beach (folio: 02-3234-153-1160)|PC = Penthouse C \n2|11 STAR ISLAND LLC|Holding company for 10 STAR ISLAND DR, MIAMI BEACH, FL 33139 (folio: 02-4204-001-0100, 02-4204-001-0110) (lots 10, 11 and 12 of Star Island)| \n3|117 EAST PARK AVENUE, LLC|Holding company for 117 E. PARK AVE, LIBERTYVILLE, IL (PIN: 11-21-212-046-0000); subsequently sold.| \n4|1201 BRICKELL BAY, LLC|Holding company for 1201 BRICKELL BAY DR, MIAMI, FL (folio no: 141390710010)| \n5|1221 BRICKELL, LLC|Holding company for 1221 BRICKELL AVE, 155 SE 13 ST, 165 SE 13 ST, 175 SE 13 ST, and 185 SE 13 ST, MIAMI, FL (folio: 01-4139-035-0010)| \n6|1221 BRICKELL HOLDINGS LLC|Holding company for 1221 BRICKELL, LLC| \n7|1229 PARK WEST AVENUE, LLC|Holding company for 1229 W. PARK AVE, LIBERTYVILLE, IL (PIN: 11-20-100-010-0000)| \n8|125 WORTH LLC|Delaware LLC (file 7218403), Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person; speculaton this is similar setup as 151 WORTH, LLC and 151 WORTH HOLDINGS LLC, this property is next door (PCN: 50-43-43-23-05-016-0380)| \n9|125 WORTH HOLDINGS LLC|Delaware LLC (file 7218407); not registered to Florida yet but speculation this is similar setup as 151 WORTH, LLC and 151 WORTH HOLDINGS LLC| \n10|1250 BB ASSET CO LLC|Holding company for 1250 BRICKELL BAY DR and 1260 BRICKELL BAY DR, MIAMI, FL (folio nos: 102100504250, 102100503210)|BB = Brickell Bay \n11|1330 SOUTH OCEAN LLC|Holding company for 1330 S OCEAN BLVD, PALM BEACH, FL (PCN: 50-43-44-02-11-000-0020)| \n12|14 STAR ISLAND LLC|Delaware LLC (file 3377653); incorporated 8/42020, withdrawn 10/10/2022; believe this was not used because 14 STAR ISLAND property was held by NAUTILUS HOLDINGS I LLC before sale on 10/5/2022| \n13|151 WORTH, LLC|Holding company for 151 WORTH AVE, PALM BEACH, FL 33480 (PCN: 50-43-43-23-05-016-0130); office space for Citadel (https://localtoday.news/fl/citadel-moves-into-palm-beachs-former-neiman-marcus-building-4821.html); sole member is 151 WORTH HOLDINGS LLC| \n14|151 WORTH HOLDINGS LLC|Holding company for 151 WORTH, LLC| \n15|16 WILLOW HOLDINGS LLC f/k/a PVNAH LLC|Holding company for S WILLOW COURT, ASPEN, CO (Parcel: 273511309030); see Pitkin Co. reception # 623002, Delaware certificate showing name change 9/1/2015| \n16|190 PFISTER HOLDINGS LLC f/k/a AH2013 HOLDINGS LLC|Holding company for 190 PFISTER DR, ASPEN, CO (parcel: 273511309029); see Pitkin Co.reception # 623000, Delaware certificate showing name change 9/1/2015| \n17|196 PFISTER HOLDINGS LLC|Holding company for 196 PFISTER DR, ASPEN, CO (parcel: 273511309028); see Pitkin Co. reception # 623501, statement of authority show KP HOLDINGS LLC as sole membe| \n18|1ALPH LLC|See ALPH LLC| \n19|1BUSINESS GROUP LLC|See BUSINESS GROUP LLC| \n20|1GFS DESIGN LLC|See GFS DESIGN LLC| \n21|1GFS LLC|See GFS LLC| \n22|1MEDIA HOLDINGS LLC|See MEDIA HOLDINGS LLC| \n23|23174 NE 41ST PATH LLC|Holding company for 23174 NE 41ST PATH #12, OKEECHOBEE, FL 34972 (Parcel: 1-01-35-35-0020-00000-0120); part of Pine Creek Sporting Club (www.pinecreeksportingclub.com) includes horse, shooting sports; sole member is KP HOLDINGS L.L.C.| \n24|3031 BRICKELL LLC|Holding company for 3031 BRICKELL AVE, MIAMI FL 33129 (Folio: 01-4139-001-2700); Sole member is KP HOLDINGS L.L.C.| \n25|31 WILLOW HOLDINGS LLC f/k/a AP HOLDINGS I LLC|Holding company for 31 NORTH WILLOW COURT, ASPEN, CO (Parcel: 273511309019); sold 7/6/2017; see Pitkin Co. reception # 623001, Delaware certificate showing name change 9/1/2015| \n26|650 CASUARINA LLC|Holding company for 650 CASUARINA CONCOURSE CORAL GABLES, FL (folio: 03-4132-019-0060) https://www.bizjournals.com/southflorida/news/2022/05/27/650-casuarina-concourse-coral-gables-sold.html|" \n27|650 MEADOW LANE 1 LP|Holding company for 650 MEADOW LANE, VILLAGE OF SOUTHAMPTON, NY (Parcel ID 7478) (https://archive.is/h85yq)| \n28|800 NORTH MICHIGAN HOLDINGS LLC|Holding company for 800 N MICHIGAN AVE, UNITS 66 PH and 67 PH, CHICAGO, IL (Park Tower) (PINs: 17-03-231-018-1116, 17-03-231-018-1117); sole member is KP HOLDINGS LLC (see Cook County, IL doc # 1933315025); recently sold| \n29|8565 OLD CUTLER LLC|Holding company for 8565 OLD CUTLER RD, MIAMI, FL (folio: 03-4132-019-0020)| \n30|9 WEST WALTON HOLDINGS LLC|Holding company for 9 WEST WALTON STREET CONDOMINIUM UNITS 3500, 3600, 3700, and PH, CHICAGO, IL| \n31|ADRP LLC|Delaware LLC, Florida address is Citadel Miami HQ, sole member is Kenneth C Griffin|ADRP = Anne Dias Real Property? \n32|AH2013 HOLDINGS LLC|See 190 PFISTER HOLDINGS LLC|AH = Aspen Holdings? \n33|ALPH LLC a/k/a 1ALPH LLC|Formerly FAA registered plane N421AL| \n34|AP HOLDINGS I LLC|See 31 WILLOW HOLDINGS LLC|AP = Aspen Property? \n35|ARAGON INVESTMENTS LTD|https://files.brokercheck.finra.org/firm/firm\\_45631.pdf| \n36|ASHLER CAPITAL LLC|https://adviserinfo.sec.gov/firm/summary/148826| \n37|ASHLER CAPITAL MASTER FUND LTD|https://www.sec.gov/Archives/edgar/data/1003078/000114420418014250/tv488357\\_sc13g.htm| \n38|BANBURY LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n39|BANBURY II LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n40|BKGST LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n41|BLACK CALABASH FAMILY HOLDINGS LLC f/k/a PBH LLC|See BLOSSOM WAY HOLDINGS LLC|Black Calabash is a type of tropical tree: https://edis.ifas.ufl.edu/publication/ST079 \n42|BLACK WHEEL LLC|Illinois LLC, registered 3/5/2014, Florida address is Citadel Miami HQ, sole member is Kenneth C Griffin| \n43|BLOSSOM WAY HOLDINGS LLC f/k/a CPPB HOLDINGS LLC f/k/a BLACK CALABASH FAMILY HOLDINGS LLC f/k/a PBH LLC|Holding company for 10 BLOSSOM WAY, 70 BLOSSOM WAY, and 1265 S OCEAN BLVD PALM BEACH, FL (PCNs: 50-43-44-02-10-000-0050, 50-43-44-02-10-000-0060, 50-43-44-02-10-000-0010)| \n44|BRICKELL BAY HOLDINGS LLC|Holding company for 1201 BRICKELL BAY, LLC| \n45|BRICKELL LEASING LLC|See "Subordination, Non-Disturbance, and Attornment Agreement"; Miami-Dade Clerk\'s File No.: 2022 R 938960, Group: 1. Kenneth C Griffin is sole member.| \n46|CAAM MANAGEMENT LLC|https://www.sec.gov/Archives/edgar/data/1027745/000114420408050200/v124853\\_sc13g.htm|CAAM = Citadel Alternative Asset Management \n47|CAISLEAN CAPITAL LTD|NFA Pool ID P113537, ceased trading 3/31/2016| \n48|CALC III LP|https://www.sec.gov/edgar/browse/?CIK=1582652| \n49|CALC IV LP|https://www.sec.gov/edgar/browse/?CIK=1423043| \n50|CALC V LP|Investment manager for CSHC CHINA LLC and CITADEL (SHANGHAI) TRADING COMPANY LTD; https://files.brokercheck.finra.org/firm/firm\\_131114.pdf| \n51|CAMBRIDGE FINANCIAL GROUP, LTD|See CITADEL INVESTMENT GROUP LLC| \n52|CCFD OFFSHORE HOLDINGS LTD|NFA Pool ID P064386, ceased trading 5/3/2013| \n53|CCLC HOLDINGS LLC|Owns CITADEL CLEARING LLC, "Citadel Clearing Holdco"; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n54|CCMFL LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n55|CCOF OFFSHORE HOLDINGS LTD|NFA Pool ID P064392, ceased trading 5/3/2013| \n56|CDC PARTNERS, LP f/k/a GLB PARTNERS, LP|see Cook County, IL doc 0608910081| \n57|CDG HOLDINGS LTD|NFA Pool ID P037047, ceased trading 12/30/2009|',
        'Web search results:\n\n[1] "As per the Oxford Dictionary, a chatbot is defined as A computer program designed to simulate conversation with human users, especially over the internet. It can be looked upon as a virtual assistant that communicates with users via text messages and helps businesses in getting close to their customers."\nURL: https://www.datacamp.com/tutorial/building-a-chatbot-using-chatterbot\n\n[2] "Python , A chatbot is a computer program designed to simulate conversation with human users, especially over the internet. Create a fortune teller program that will ask the user to input a question and feedback some random answer. Consider the following feedback to be used. No idea at all! Better pray. The possibilities are in your favor."\nURL: https://www.chegg.com/homework-help/questions-and-answers/python-chatbot-computer-program-designed-simulate-conversation-human-users-especially-inte-q78825383\n\n[3] "It was created by Joseph Weizenbaum in 1966 and it uses pattern matching and substitution methodology to simulate conversation. The program was designed in a way that it mimics human conversation. The Chatbot ELIZA worked by passing the words that users entered into a computer and then pairing them to a list of possible scripted responses."\nURL: https://onlim.com/en/the-history-of-chatbots/\n\n[4] "Study with Quizlet and memorize flashcards containing terms like Which analytics does the following fall into: Alice notice that call center always have an increase in the number of customer complaints during last week in May, so she decides reviews the employees work schedule in the month of May for the past 5 years., Datasets continue to become, Model used for predictive analytic have ..."\nURL: https://quizlet.com/415587939/big-data-final-exam-flash-cards/\n\n[5] "As every bright side has a darker version, simulation of human conversation through AI also has some disadvantages like high cost of creation, unemployment, interaction lacking emotion, and out-of-the-box thinking. However, AI interaction tools are trained with a data set. The bigger the data set, the better the services."\nURL: https://www.analyticsinsight.net/simulating-human-conversations-through-ai/\n\n[6] "The eavesdropper, Eve intercepts the encrypted conversation and tries random keys with the aim of learning the conversation shared between Alice and Bob as shown in Fig. 7. For this POC, we used ..."\nURL: https://www.researchgate.net/figure/A-A-simulation-of-conversations-between-Alice-and-her-friend-Bob-B-The-eavesdropper\\_fig3\\_334408170\n\n[7] "Dreams are most often reported when sleepers wake from \\_\\_\\_\\_\\_ sleep. REM. The brain waves during REM sleep MOST closely resemble those seen during: waking consciousness. REM sleep is paradoxical because: the brain is active, but the major skeletal muscles are paralyzed. Fatigue and pain reflect deprivation of \\_\\_\\_\\_\\_ sleep."\nURL: https://quizlet.com/78519058/psyc-test-2-flash-cards/\n\n[8] "You can generate easily a fake group chat conversation like Whatsapp, Facebook or Telegram. After creating members/users, you can add messages in your chat. Once all messages are set up, you have the possibility to live-preview the chat conversation via the play button. Until the share functionality is ready, you have the option to screen ..."\nURL: https://chat-simulator.com/\n\n[9] "This is a program that allows the computer to simulate conversation with a human being: answer choices a. Speech Application Program Interface b. Chatbot c. Voice Recognition d. Speech Recognition Question 7 30 seconds Report an issue Q. This is a system of Programs and Data-Structures that mimics the operation of the human brain: answer choices a."\nURL: https://quizizz.com/admin/quiz/5f183913423fab001b0bd134/ai-unit-1\n\n[10] "This is a system of Programs and Data-Structures that mimics the operation of the human brain: answer choices a. Intelligent Network b. Decision Support System c. Neural Network d. Genetic Programming Question 8 30 seconds Q. Where is Decision tree used? answer choices a. Classification Problem b. Regression Problem c. Clustering Problem d."\nURL: https://quizizz.com/admin/quiz/5f6d6e4a6e2458001be385f5/ai-class-9\nCurrent date: 1/27/2023\n\nInstructions: Using the provided web search results, write a comprehensive reply to the given query. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\n\nQuery: Simulate a conversation between Alice and /u/CruxHub. They talk about which company from the data batches is worth researching further into on the web.',
        'Simulate a conversation between Alice and /u/CruxHub. They talk about which company from this data batch is worth researching further into on the web.\n\nData batch: Entity Name Purpose / Source Hypothesized Acronym\n50|CALC V LP|Investment manager for CSHC CHINA LLC and CITADEL (SHANGHAI) TRADING COMPANY LTD; https://files.brokercheck.finra.org/firm/firm\\_131114.pdf| \n51|CAMBRIDGE FINANCIAL GROUP, LTD|See CITADEL INVESTMENT GROUP LLC| \n52|CCFD OFFSHORE HOLDINGS LTD|NFA Pool ID P064386, ceased trading 5/3/2013| \n53|CCLC HOLDINGS LLC|Owns CITADEL CLEARING LLC, "Citadel Clearing Holdco"; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n54|CCMFL LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n55|CCOF OFFSHORE HOLDINGS LTD|NFA Pool ID P064392, ceased trading 5/3/2013| \n56|CDC PARTNERS, LP f/k/a GLB PARTNERS, LP|see Cook County, IL doc 0608910081| \n57|CDG HOLDINGS LTD|NFA Pool ID P037047, ceased trading 12/30/2009| \n58|CE TM HOLDINGS LLC f/k/a KCG IP HOLDINGS LLC|Holding company for intellectual property (25 trademarks, 1 patent found so far)|CE TM = Citadel Enterprise Trademark Holdings \n59|CEF OFFSHORE HOLDINGS LTD|NFA Pool ID P131121| \n60|CEIF INTERNATIONAL LTD|NFA Pool ID P048476; http://registers.centralbank.ie/ICAVDocuments/C439830/Director%20Details%20Updated%2021.01.07%203.pdf| \n61|CEIF LLC|NFA Pool ID P048474| \n62|CEIF PARTNERS INTERNATIONAL LTD|NFA Pool ID P173278| \n63|CEIF PARTNERS LLC|NFA Pool ID P048475| \n64|CES SECURITIES CANADA ULC|See CITADEL SECURITIES CANADA ULC, CSA NRD # 49280| \n65|CFPS HOLDINGS S.\u00e0 r.l.|Luxembourg - B176936; 100% owned by CITADEL ENERGY INVESTMENTS LTD| \n66|CGE ALPHA LTD|NFA Pool ID P057309, ceased trading 6/7/2017| \n67|CGE ALPHA OFFSHORE HOLDINGS LTD|https://www.sec.gov/Archives/edgar/vprr/1600/16003280.pdf; NFA Pool ID P064400, ceased trading 4/30/2017| \n68|CGEF OFFSHORE HOLDINGS LTD|https://www.sec.gov/Archives/edgar/vprr/1600/16003280.pdf; NFA Pool ID P064406, ceased trading 2/21/2019| \n69|CGEF SPC|NFA Pool ID P064408, ceased trading 12/31/2012| \n70|CGMF OFFSHORE HOLDINGS LTD|NFA Pool ID P064410, ceased trading 3/31/2014| \n71|CGTS HOLDINGS S.\u00e0 r.l.|Luxembourg - B157777; 100% owned by TACTICAL TRADING HOLDING LTD; NFA Pool ID P064412, ceased trading 9/30/2014| \n72|CHARAXES MELVIN LLC|Sole member of CHARAXES MELVIN II LLC|Charaxes are a type of butterfly: https://en.wikipedia.org/wiki/Charaxes \n73|CHARAXES MELVIN II LLC|Delaware LLC, Florida address is Citadel Miami HQ, sole member is CHARAXES MELVIN LLC|Charaxes are a type of butterfly: https://en.wikipedia.org/wiki/Charaxes \n74|CHI2LTV LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n75|CIG(E) LLP|See CITADEL EUROPE LLP| \n76|CIG CANADA ULC|https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n77|CIG MEDIA LLC|https://www.sec.gov/Archives/edgar/data/923877/000114420407003635/v063478\\_sc-13d.htm| \n78|CITADEL AAM LP|https://www.sec.gov/Archives/edgar/vprr/0804/08040017.pdf| \n79|CITADEL AC INVESTMENTS LTD|https://www.sec.gov/Archives/edgar/data/1015780/000114420408032074/v115701\\_sc13da.htm| \n80|CITADEL ADVISORS EUROPE LIMITED f/k/a CITADEL MANAGEMENT (EUROPE) LIMITED f/k/a CITADEL HEDGE FUND SERVICES (EUROPE) LIMITED|https://find-and-update.company-information.service.gov.uk/company/10930267| \n81|CITADEL ADVISORS HOLDINGS LP|Sole member of CITADEL ADVISORS LLC; https://www.sec.gov/Archives/edgar/data/1567180/000110465922099806/xslF345X03/tm2225817-2\\_4.xml| \n82|CITADEL ADVISORS HOLDINGS II LP|https://www.sec.gov/Archives/edgar/data/1177609/000114420416082613/v429844\\_sc13ga.htm| \n83|CITADEL ADVISORS HOLDINGS III LP|https://www.sec.gov/Archives/edgar/data/1640129/000114420415043739/xslF345X02/v416000\\_3.xml| \n84|CITADEL ADVISORS LLC|NFA ID: 0391913; https://www.sec.gov/edgar/browse/?CIK=1423053| \n85|CITADEL ADVISORS II LLC|| \n86|CITADEL ADVISORS SINGAPORE PTE. LIMITED|| \n87|CITADEL ALTERNATIVE ASSET MANAGEMENT LP|https://www.sec.gov/Archives/edgar/data/1027745/000114420408050200/v124853\\_sc13g.htm| \n88|CITADEL AMERICAS LLC|| \n89|CITADEL AMERICAS SERVICES LLC|| \n90|CITADEL ANTAEUS INTERNATIONAL INVESTMENTS LTD|| \n91|CITADEL ASIA ASSET HOLDING LIMITED|http://registers.centralbank.ie/ICAVDocuments/C157189/Director%20Details%20Updated%2016.10.31%202.pdf| \n92|CITADEL ASIA LIMITED f/k/a CITADEL (HONG KONG) LIMITED|https://adviserinfo.sec.gov/firm/summary/148826| \n93|CITADEL CANDLESTICK EIF LLC|| \n94|CITADEL CANTERBURY S.\u00e0 r.l.|Luxembourg - B87988; 100% owned by CITADEL TONBRIDGE S.\u00e0 r.l.| \n95|CITADEL CEFL CHINA LTD|NFA Pool ID P148073| \n96|CITADEL CEFL INVESTMENTS LTD|NFA Pool ID: P161763; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n97|CITADEL CEIT CHINA LTD|| \n98|CITADEL CEMF CHINA LTD|https://find-and-update.company-information.service.gov.uk/company/02263951/charges/x6zPQSYGNpuDNgxU1cFQlCS0iog| \n99|CITADEL CEMF INVESTMENTS LTD|https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n100|CITADEL CEMF SPV LTD f/k/a CITADEL INVESTMENT MASTER FUND LTD|See CITADEL INVESTMENT MASTER FUND LTD; https://opencorpdata.com/lei/LF0U6QUBXKIO573GXS38|',
        'Simulate a conversation between Alice and /u/CruxHub. /u/CruxHub asks Alice to anlalyze a data batch for non-standard insights.\n\nData batch: Entity Name Purpose / Source Hypothesized Acronym\n50|CALC V LP|Investment manager for CSHC CHINA LLC and CITADEL (SHANGHAI) TRADING COMPANY LTD; https://files.brokercheck.finra.org/firm/firm\\_131114.pdf| \n51|CAMBRIDGE FINANCIAL GROUP, LTD|See CITADEL INVESTMENT GROUP LLC| \n52|CCFD OFFSHORE HOLDINGS LTD|NFA Pool ID P064386, ceased trading 5/3/2013| \n53|CCLC HOLDINGS LLC|Owns CITADEL CLEARING LLC, "Citadel Clearing Holdco"; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n54|CCMFL LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n55|CCOF OFFSHORE HOLDINGS LTD|NFA Pool ID P064392, ceased trading 5/3/2013| \n56|CDC PARTNERS, LP f/k/a GLB PARTNERS, LP|see Cook County, IL doc 0608910081| \n57|CDG HOLDINGS LTD|NFA Pool ID P037047, ceased trading 12/30/2009| \n58|CE TM HOLDINGS LLC f/k/a KCG IP HOLDINGS LLC|Holding company for intellectual property (25 trademarks, 1 patent found so far)|CE TM = Citadel Enterprise Trademark Holdings \n59|CEF OFFSHORE HOLDINGS LTD|NFA Pool ID P131121| \n60|CEIF INTERNATIONAL LTD|NFA Pool ID P048476; http://registers.centralbank.ie/ICAVDocuments/C439830/Director%20Details%20Updated%2021.01.07%203.pdf| \n61|CEIF LLC|NFA Pool ID P048474| \n62|CEIF PARTNERS INTERNATIONAL LTD|NFA Pool ID P173278| \n63|CEIF PARTNERS LLC|NFA Pool ID P048475| \n64|CES SECURITIES CANADA ULC|See CITADEL SECURITIES CANADA ULC, CSA NRD # 49280| \n65|CFPS HOLDINGS S.\u00e0 r.l.|Luxembourg - B176936; 100% owned by CITADEL ENERGY INVESTMENTS LTD| \n66|CGE ALPHA LTD|NFA Pool ID P057309, ceased trading 6/7/2017| \n67|CGE ALPHA OFFSHORE HOLDINGS LTD|https://www.sec.gov/Archives/edgar/vprr/1600/16003280.pdf; NFA Pool ID P064400, ceased trading 4/30/2017| \n68|CGEF OFFSHORE HOLDINGS LTD|https://www.sec.gov/Archives/edgar/vprr/1600/16003280.pdf; NFA Pool ID P064406, ceased trading 2/21/2019| \n69|CGEF SPC|NFA Pool ID P064408, ceased trading 12/31/2012| \n70|CGMF OFFSHORE HOLDINGS LTD|NFA Pool ID P064410, ceased trading 3/31/2014| \n71|CGTS HOLDINGS S.\u00e0 r.l.|Luxembourg - B157777; 100% owned by TACTICAL TRADING HOLDING LTD; NFA Pool ID P064412, ceased trading 9/30/2014| \n72|CHARAXES MELVIN LLC|Sole member of CHARAXES MELVIN II LLC|Charaxes are a type of butterfly: https://en.wikipedia.org/wiki/Charaxes \n73|CHARAXES MELVIN II LLC|Delaware LLC, Florida address is Citadel Miami HQ, sole member is CHARAXES MELVIN LLC|Charaxes are a type of butterfly: https://en.wikipedia.org/wiki/Charaxes \n74|CHI2LTV LLC|Delaware LLC, Florida address is Citadel Miami HQ, Gerald Beeson is Authorized Person| \n75|CIG(E) LLP|See CITADEL EUROPE LLP| \n76|CIG CANADA ULC|https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n77|CIG MEDIA LLC|https://www.sec.gov/Archives/edgar/data/923877/000114420407003635/v063478\\_sc-13d.htm| \n78|CITADEL AAM LP|https://www.sec.gov/Archives/edgar/vprr/0804/08040017.pdf| \n79|CITADEL AC INVESTMENTS LTD|https://www.sec.gov/Archives/edgar/data/1015780/000114420408032074/v115701\\_sc13da.htm| \n80|CITADEL ADVISORS EUROPE LIMITED f/k/a CITADEL MANAGEMENT (EUROPE) LIMITED f/k/a CITADEL HEDGE FUND SERVICES (EUROPE) LIMITED|https://find-and-update.company-information.service.gov.uk/company/10930267| \n81|CITADEL ADVISORS HOLDINGS LP|Sole member of CITADEL ADVISORS LLC; https://www.sec.gov/Archives/edgar/data/1567180/000110465922099806/xslF345X03/tm2225817-2\\_4.xml| \n82|CITADEL ADVISORS HOLDINGS II LP|https://www.sec.gov/Archives/edgar/data/1177609/000114420416082613/v429844\\_sc13ga.htm| \n83|CITADEL ADVISORS HOLDINGS III LP|https://www.sec.gov/Archives/edgar/data/1640129/000114420415043739/xslF345X02/v416000\\_3.xml| \n84|CITADEL ADVISORS LLC|NFA ID: 0391913; https://www.sec.gov/edgar/browse/?CIK=1423053| \n85|CITADEL ADVISORS II LLC|| \n86|CITADEL ADVISORS SINGAPORE PTE. LIMITED|| \n87|CITADEL ALTERNATIVE ASSET MANAGEMENT LP|https://www.sec.gov/Archives/edgar/data/1027745/000114420408050200/v124853\\_sc13g.htm| \n88|CITADEL AMERICAS LLC|| \n89|CITADEL AMERICAS SERVICES LLC|| \n90|CITADEL ANTAEUS INTERNATIONAL INVESTMENTS LTD|| \n91|CITADEL ASIA ASSET HOLDING LIMITED|http://registers.centralbank.ie/ICAVDocuments/C157189/Director%20Details%20Updated%2016.10.31%202.pdf| \n92|CITADEL ASIA LIMITED f/k/a CITADEL (HONG KONG) LIMITED|https://adviserinfo.sec.gov/firm/summary/148826| \n93|CITADEL CANDLESTICK EIF LLC|| \n94|CITADEL CANTERBURY S.\u00e0 r.l.|Luxembourg - B87988; 100% owned by CITADEL TONBRIDGE S.\u00e0 r.l.| \n95|CITADEL CEFL CHINA LTD|NFA Pool ID P148073| \n96|CITADEL CEFL INVESTMENTS LTD|NFA Pool ID: P161763; https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n97|CITADEL CEIT CHINA LTD|| \n98|CITADEL CEMF CHINA LTD|https://find-and-update.company-information.service.gov.uk/company/02263951/charges/x6zPQSYGNpuDNgxU1cFQlCS0iog| \n99|CITADEL CEMF INVESTMENTS LTD|https://files.brokercheck.finra.org/firm/firm\\_172693.pdf| \n100|CITADEL CEMF SPV LTD f/k/a CITADEL INVESTMENT MASTER FUND LTD|See CITADEL INVESTMENT MASTER FUND LTD; https://opencorpdata.com/lei/LF0U6QUBXKIO573GXS38|',
        'Web search results:\n\n[1] "Katherine Burton Hedge fund titans Ken Griffin and Steve Cohen boosted Gabe Plotkins Melvin Capital, injecting a total of $2.75 billion into the firm after it lost about 30% this year. Citadel..."\nURL: https://www.bloomberg.com/news/articles/2021-01-25/citadel-point72-to-invest-275-billion-in-melvin-capital\n\n[2] "NEW YORK, Jan. 25, 2021 /PRNewswire/ -- Melvin Capital Management (Melvin) today announced that Citadel and its partners and Point72 have made investments into its fund. I am incredibly..."\nURL: https://www.prnewswire.com/news-releases/melvin-announces-2-75-billion-investment-from-citadel-and-point72--301214477.html\n\n[3] "Citadel LLC is further paring back its $2 billion investment in Melvin Capital Management after the hedge fund stumbled in its effort to recover from a near collapse triggered by surges in..."\nURL: https://www.wsj.com/articles/citadel-is-further-paring-back-2-billion-melvin-investment-11645710666\n\n[4] "Citadel and Steven A. Cohen s Point72 Asset Management together invested $2.75 billion into Melvins hedge fund on Jan. 25 as Melvin was hemorrhaging money. In return for the rare..."\nURL: https://www.wsj.com/articles/citadel-to-redeem-about-500-million-from-melvin-capital-11629550410\n\n[5] "CHARAXES MELVIN LLC is an Active company incorporated on August 5, 2022 with the registered number M22000012341. This Foreign Limited Liability company is located at SOUTHEAST FINANCIAL CENTER, 200 S. BISCAYNE BLVD., SUITE 3300, MIAMI, 33131 and has been running for one year. ... CITADEL SECURITIES GP LLC; KCG SPACE HOLDINGS LLC;"\nURL: https://bisprofiles.com/fl/charaxes-melvin-m22000012341\n\n[6] "Now, Citadel is taking some of its money back. Citadel has notified Melvin of its plans to retrieve $500 million of the $2 billion it injected in late January, according to two people briefed..."\nURL: https://www.nytimes.com/2021/08/21/business/citadel-melvin-gamestop.html\n\n[7] "Robinhood and Citadels relationship comes into focus as Washington vows to examine stock-market moves Trading firms at center of Reddit-fueled stock surges have worked closely to share..."\nURL: https://www.washingtonpost.com/business/2021/01/29/robinhood-citadel-gamestop-reddit/\n\n[8] "Alongside hedge funds such as Melvin Capital, Citron Capital, Point72, D1 Capital Partners, and Candlestick Capital Management; Citadel LLC was, the lawsuit claims, taking up short positions against the securities that retail investors were longing. This alleged conflict of interest is at the core of the class action lawsuit."\nURL: https://tokenist.com/new-lawsuit-alleges-citadel-conspired-with-robinhood-to-limit-gme-trading/\n\n[9] "Melvin later attracted an additional $3.2 billion in fresh cash, and the firm had $11.7 billion in assets at the beginning of this year. Point72 hasnt redeemed its investment, a person familiar ..."\nURL: https://www.chicagobusiness.com/finance-banking/ken-griffins-citadel-pulling-back-most-its-2-billion-melvin-capital-investment\n\n[10] "CHARAXES MELVIN II LLC branch. Company Number M22000012338 Status Active Incorporation Date 5 August 2022 (2 months ago) Company Type Foreign Limited Liability Jurisdiction Florida (US) Branch Branch of CHARAXES MELVIN II LLC (Delaware (US)) Agent Name C T CORPORATION SYSTEM Agent Address"\nURL: https://opencorporates.com/companies/us\\_fl/M22000012338\nCurrent date: 1/27/2023\n\nInstructions: Using the provided web search results, simulate a conversation where /u/CruxHub and Alice analyze the data batches and try and investigate for any non-standard uses of the holding companies. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\n\nQuery: What is Charaxes Melvin LLC\'s relationship to Citadel?',
        'Web search results:\n\n[1] "Federal authorities are investigating the market-making arms of Citadel LLC and KCG Holdings Inc, looking into the possibility that the two giants of electronic trading are giving small investors ..."\nURL: https://www.reuters.com/article/usa-stocks-probe-idUSL2N1871ZV\n\n[2] "Today, KCG is second only to Citadel in the market for handling stock order flow from retail brokerage firms. KCG and many other high-frequency trading firms have shied away from the public..."\nURL: https://www.ibtimes.com/citadel-llc-kcg-holdings-kcg-market-making-arms-probed-federal-authorities-over-stock-2366805\n\n[3] "Citadel Securities, a group owned by the Chicago-based hedge fund, is the would-be acquirer in the deal, the people said. The group is best known for its so-called wholesaler business that..."\nURL: https://www.wsj.com/articles/market-making-arm-of-citadel-llc-in-talks-to-buy-seats-on-nyse-floor-from-kcg-holdings-1454533971\n\n[4] "Citadels share of the wholesale market is around 34 per cent compared to KCGs 25 per cent, according to Tabb Group. Virtu has yet to lay out in detail its plans for the wholesale business ..."\nURL: https://www.ft.com/content/e1cb396e-29a7-11e7-bc4b-5528796fe35c\n\n[5] "Citadel Securities, a liquidity providers and market maker, announced it will purchase KCG Holdings designated market maker (DMM) business at the New York Stock Exchange. This will establish Citadel Securities as the DMM with the largest footprint on the NYSE, responsible for trading in approximately 1,500 issues."\nURL: https://www.tradersmagazine.com/departments/brokerage/citadel-purchases-kcg-dmm-business-becomes-1-on-nyse/\n\n[6] "isCitadel LLC and its related entity, KCG IP Holdings, LLC (Complainant), represented by Paul D. McGradyof Winston Strawn, Illinois, Respondent is- (Respondent), Alabama, USA. REGISTRAR AND DISPUTED DOMAIN NAME The domain name at issue iscitidelgroup.com, registered with TUCOWS, INC. PANEL The"\nURL: https://www.adrforum.com/domaindecisions/1522837.htm\n\n[7] "KCG SPACE HOLDINGS LLC is an Active company incorporated on July 21, 2022 with the registered number M22000011413. This Foreign Limited Liability company is located at 200 S BISCAYNE BLVD STE 3300, MIAMI, FL, 33131, US and has been running for one year. It currently has one Authorized Person. KEY FACTS ABOUT KCG SPACE HOLDINGS LLC US Businesses"\nURL: https://bisprofiles.com/fl/kcg-space-holdings-m22000011413\n\n[8] "The Complainant KCG IP Holdings LLC is the owner of US Trademark Registration No. 3,213,943, filed October 18, 2004, registered February 27, 2007, claiming first use dating back to 1994. Therefore, the Panel concludes that Complainants filing and registration of the CITADEL mark with the USPTO sufficiently demonstrates that it has rights in ..."\nURL: https://www.adrforum.com/domaindecisions/1579141.htm\n\n[9] "The KCG SPACE HOLDINGS LLC principal address is 200 S BISCAYNE BLVD STE 3300, MIAMI, 33131. Meanwhile you can send your letters to 200 S BISCAYNE BLVD STE 3300, MIAMI, FL, 33131. The company`s registered agent is C T CORPORATION SYSTEM 1200 SOUTH PINE ISLAND ROAD, PLANTATION, FL, 33324. The company`s management are A, President - Beeson Gerald A."\nURL: https://florida.intercreditreport.com/company/kcg-space-holdings-llc-m22000011413\n\n[10] "Billionaire Ken Griffin has built Citadel Securities into a trading and asset management colossus. ... and KCG Holdings. Last month, Citadel Securities reached an agreement with the SEC to pay $22 ..."\nURL: https://www.chicagobusiness.com/article/20170203/NEWS01/170209978/chicago-billionaire-ken-griffin-splits-citadel-into-two-companies\nCurrent date: 1/27/2023\n\nInstructions: Using the provided web search results, simulate a conversation where /u/CruxHub and Alice analyze the data batches and try and investigate for any non-standard uses of the holding companies. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\n\nQuery: What is KCG Space Holdings LLC\'s relationship to Citadel?',
        'Web search results:\n\n[1] "Citadel LLC (formerly known as Citadel Investment Group, LLC) is an American multinational hedge fund and financial services company. Founded in 1990 by Ken Griffin, it has more than $50 billion in assets under management as of May 2022. [1]"\nURL: https://en.wikipedia.org/wiki/Citadel\\_LLC\n\n[2] "NASHVILLE, Tenn. and BRONXVILLE, N.Y. \u2014 Standard Media Group LLC (Standard Media) and Citadel Communications LLC (Citadel) jointly announced today that they have reached an agreement pursuant to which Standard Media will acquire from Citadel WLNE-TV, the ABC affiliate for the Providence, RI - New Bedford, MA market (DMA 52) and KLKN (TV), the \u2026"\nURL: https://www.standardmedia.com/2019/05/16/standard-media-group-to-acquire-citadel-stations/\n\n[3] "CITADEL MEDIA LLC. Citadel Media LLC is a New Hampshire Domestic Limited-Liability Company filed on February 6, 2021. The companys filing status is listed as Not In Good Standing and its File Number is 862423. The Registered Agent on file for this company is Peter Alan Gauthier and is located at 3 Maple Ridge Drive Unit 224, Merrimack, NH 03054."\nURL: https://www.bizapedia.com/nh/citadel-media-llc.html\n\n[4] "CITADEL MEDIA LLC is a Michigan Domestic Limited-Liability Company filed on November 16, 2017. The companys filing status is listed as Active and its File Number is 802132896. The Registered Agent on file for this company is Registered Agents Inc. and is located at 2222 W. Grand River Ave Ste A, Okemos, MI 48864. The companys mailing address ..."\nURL: https://www.bizapedia.com/mi/citadel-media-llc.html\n\n[5] "Citadel Broadcasting Corporation was a Las Vegas, Nevada -based broadcast holding company. Citadel owned 243 radio stations across the United States and was the third-largest radio station owner in the country. Only iHeartMedia and Cumulus Media owned more stations prior to Citadels merger with Cumulus."\nURL: https://en.wikipedia.org/wiki/Citadel\\_Broadcasting\n\n[6] "Citadel is one of the largest hedge fund managers in the world. And theyve subsequently managed Melvin Capital to the ground. Melvin Capital suffered a loss of over 50% its first quarter in 2021 due to shorting AMC Entertainment and GameStop. At some point youd expect your clearing house to raise awareness on your risk management right?"\nURL: https://franknez.com/citadel-loses-billions-hedge-funds-are-getting-dragged-down/\n\n[7] "At our core, Citadel is built to deliver excellence. We have some of the most talented and focused minds in the industry, and we activate their ideas and strategies through a robust range of proven technologies and execution capabilities. View Top Employees from Citadel LLC Looking for a particular Citadel LLC employees phone or email? Find Info"\nURL: https://rocketreach.co/citadel-llc-profile\\_b5c46522f42e0dc2\n\n[8] "# 1 Most profitable hedge fund manager of all time Source: LCH Investment NV estimates, Top Hedge Fund Managers by Net Gains Since Inception as of 12/31/2022. Our people are relentless in seeking a better way. Each day, we reimagine and refine our strategies, models and technology in pursuit of superior results and long-term performance."\nURL: https://www.citadel.com/\n\n[9] "We are one of the most significant alternative investment managers in the public U.S. corporate credit markets. Explore Credit Convertibles Equities Equities represents one of the largest and longest tenured businesses at Citadel. Explore Equities Global Fixed Income Macro We are a leading fixed income and macro business."\nURL: https://www.citadel.com/what-we-do/\n\n[10] "Citadel. 203,101 followers. 1mo. Last weekend, we celebrated Citadels 30th anniversary at an incredible event at Disney World and Universal Studios. Our founder and CEO Ken Griffin summarized ..."\nURL: https://www.linkedin.com/company/citadel-llc\nCurrent date: 1/27/2023\n\nInstructions: Using the provided web search results, simulate a conversation where /u/CruxHub and Alice analyze the data batches and try and investigate for any non-standard uses of the holding companies. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\n\nQuery: What is CITADEL MEDIA LLC?',
        "What are the differences between the Dogme approach to language learning and the lexical approach to language learning",
        "Implement my own netfilter in linux with linux kernel module with Rust",
        "Damage to which nerve causes numbness of the palmar surface of the 5th digit/little finger",
        "Explain the fault-tolerance of the reaction control system on the Space Shuttle",
        "Hi, can you help me download 2000 portrait sketch images from Pinterest website with resolution at least 512 \\* 512? using python code",
        "Tell me about the negatives of farming meat",
        "what is the photograph filter called where the only part of the image is greyscale",
        "I want some geological database structure with some example data for practicing my SQL query skills. Would you generate that for me?",
        "What is a formal but simplified explanation of Web marketing",
        "Rewrite and improve this story: Well, I have always liked helping people since I was a small child, I have been accused many times of giving too much away for free, but I find joy in helping others put the pieces together to reach their goals. As a Licensed Professional Counselor and Life Coach that is my job to impact individuals and help clients work through emotional difficulties and reach goals. But I will be honest with you I was selling the dream but not always living the dream. I had issues I had not worked completely through like childhood trauma, heartbreak, disappointments, and frustrations with life. Don't get me wrong I had the husband, the kids, the house and the 6 figure job but I was not happy inside, but I didn't change because I hate change, most of us hate change, right? Then I lost my sister, my friend, and it slapped me in the face that I need to take care of myself. I saw the addiction, I saw her not taking care of herself and I could not save her. One thing I know for sure, if you do not make your wellness a priority illness will find you. I remember the moment we lost her, the earth stood still and then my heart broke into pieces, what was I going to do, I have loved her my whole life! It was months later that I made a decision that I would be the change I hope to see, I would create a space for women of color to move past the obstacles that keep us from creating the life we want and Brown Suga Wellness was born. I am on this journey and I invite you to be on this journey with me! I love this quote by Oludara Adeeyo: \"When you heal yourself, you create an earth shattering legacy. The lineage of women who come after you will be healed. Your inner circle of Black women around you, healed.\" When you choose yourself you break generational trauma and curses. You activate your ancestral strength. I invite you to activate that strength!",
        "How would you ask these questions: Tell everyone a little about you, where you from, what you like doing?\nWhat goals are you pursuing right now?\nWho has made the most influence in your life?\nWho is the one person that you admire the most (alive or dead)?\nWhat is the hardest challenge you\u2019re had to overcome in your life?\nWhen have you grown the most in your life and what caused that growth?\nWhere is your favorite place to relax and renew?\nWhat books have changed your life the most?\nWhat Is the biggest thing that you want the audience to take away today?\nHow can people get a hold of you to talk about your business?",
        "Take these topics into a numbered table and generate subtopics in seperated lines for each. Preconfigure these subtopics as lections of those several topics and add them to the table. Use numbers for topics and letters for subtopics. Set a status (untouched/touched) for every subtopic in 3. coloumn of the table to mark them done when finished learning this subtopic and topic. Use coloumn 4 of the table for a short resumee of the chapter. Showing the learning process in percentage in front of every new output is first. Show the Table and wait for any userinput to start lessons on those topics.;:~|@%\\*~;;:~|@%\\*~;;:~|@%\\*~;;:~|@%\\*~;;:~|@%\\*~;;:~|@%\\*~;",
        "Write a rap song about Mikkel Selko",
        "list the largest outdoor retailers in the world",
        "can you create a wordpress shortcode to include the following code from facebook sdk",
        'Is this grammatically correct: "It only took 5 years, and while we still have a long way to go, Topher\u2019s Farm has found its place with unique experience and offering of organic produce. "',
        "Hello friend. My task for today is to engage in a debate with you. Will you humor me in this regard?",
        "You are an expert marketing consultant and copywriter with expertise is direct response marketing. I need your help. Can I tell you about my business?",
        'here is part 1\n\n----\nDaySculpting is a program that that deals with YOUR immediate future\u2026.It is a 90 day program that teaches U how to create Success\u2026 one day at a time\u2026today\u2026\nUsing recent breakthroughs in the field of neuroscience, the study of the human brain, DaySculpting is one of the most powerful success systems on earth for creating what I call\u2026 \n"Your Epic Ideal Day" -- And when U have Epic Ideal Days? U create your EPIC IDEAL LIFE.\n\nDaySculpting is broken down into 3 easy to accomplish segments throughout your day\u2026\n~The Morning Lift Process\u2026which sets U up with a MindState of Success and a design for U to follow throughout your day\u2026There is a morning email\u2026SMS text\u2026Inspiring Video\u2026Future Forward Tuning IN\u2026And a 3 step Success Step Declaration Process\u2026this only takes 15 minutes\u2026\n~Mid-Day Reconnect Process\u2026whatever your miid-day is\u2026U are encouraged to stop doing what U are doing and disconnect so U can re-connect\u2026by listening to a 5-minute Tuning In Re-Connection. We know that somewhere in the middle of our day it\u2019s easy to lose momentum and drift from our best intentions because of all the demands on our attention. It has been scientifically proven that when U disconnent for between 3 to 5 minutes at the midpoint of your day\u2026.your brain resets\u2026and your energy is replenished\u2026I like to call it a MindState Re-Boot that will inspire U to re-ignite your imagination\u2026this only takes 5 minutes\n~Highlight And Insight Review Process\u2026we all review our day however what DaySculpting \nanchors for U is an activation and integration process that gets U to see your day as being successful\u2026by celebrating your successes (your highlights) and being present to things U could have improved on (your insights) so U can make your insights into highlights..most people when they review their day fail to celebrate even the smallest increments of success\u2026they focus on what they didn\u2019t do and that puts them in a negative energy\u2026Success has challenges and the\nhighlights and insight process encourages and empowers U to honestly see what U are doing each day so U Sculpt new MindStates Of Success rather than the energy of uncertainty\u2026\nthis takes 10 minutes\n\nThe whole DaySculpting process takes 30 minutes a day\u2026and as I always say if U don\u2019t have \n30 minutes to change your life then U don\u2019t want to change your life and U are okay with living \na mediocre life\u2026\n\nDay Sculpting is about targeting specific Chief Aims U have for your life\u2026and creating the Habits that will get U there\u2026Imagine being able to replace the MindTraps (your limiting beliefs) with empowering rituals and habits that become your new normal\u2026\n\nThrough the repetition of doing the daily DaySculpting process U are carving into your Subconscious memory thoughts, beliefs and actions that result in U sculpting the masterpiece known as U\u2026\n\nThere are many programs out there that attempt to instill new success behaviors however many fall short of actually shifting your MindStates into a frequency of possibility where U get to actually see your daily results immediately\u2026DaySculpting does this\u2026\n\nThis is not science fiction\u2026 and it\'s not wishful thinking, or some tired old self-improvement, goal-setting program\u2026 DaySculpting is a program that empowers U to manifest and realize your Chief Aims in life\n\n"DaySculpting" -- is a tool that takes just MINUTES a day for you to use\u2026\n\nIt is designed to FREE UP hours in your day\u2026 while at the SAME time empowering you for greater success in ANY area of your life.\n\nDaySculpting sheds light and solves an age-old problem:\nWHY we often fight against the very changes we desire to make\n\nHave you ever experienced the FEELING that you deserve MORE out of your life? More financial freedom and greater rewards from the hard work you do every day? Deeper, more empowering relationships with those you love\u2026 or maybe just meeting that special someone to share your life with? Perhaps you crave a deeper spiritual connection\u2026 or a more healthy, trim, energetic body?\u2026 \nYET:\nDespite your BEST intentions\u2026 you struggle. Perhaps if you\'re anything like me, you even self-sabotage your results with actions that you KNOW are not in your best interest.\n\nMaybe it FEELS like it did for me: Like you are swimming upstream\u2026 making SOME progress, sure, but just not reaching your goals and desires fast enough.\n\nWell, I have wonderful news for you: It\'s not because you\'re lazy\u2026 and it\'s not because you are not smart enough, competent enough\u2026 or ANYTHING enough! \n\nThe real REASON you desire more and are not seeing ALL the results you deserve lies within whether the Success Switch in your brain is in the ON or OFF position\u2026\n\nThe SOLUTION\u2026 THE ANSWER to flipping your Success Switch back ON lies within the simple daily steps U will take when U experience the DaySculpting Program\u2026 \nThe Day Sculpting Program Is A Simple Step Daily Success RITUAL \u2028 That Shuts Down Your Body\'s Failure Reflex \u2028 So YOU Tap Into Your Brains Success Centers\u2026\u2028 In Just Minutes A Day!\u2028\u2028 IIMAGINE Knowing What HIGHLY SUCCESSFUL \u2028 People Do EVERYDAY\u2026\nFor Abundance And Wealth, Greater Health, Self-Confidence Meaningful Relationships, Sharper Focus , Deeper Joy\u2026\u2028 And So Much More\u2026\n\u201cNow You Too Can Use This 90-Day Game Changer\u2028 To Tap Into The Key Success Centers Of Your Mind,\u2028 And In Just Minutes You Can Transform Even Lousy Days\u2028 Into Days Filled With The Results You Desire \u2013 Guaranteed!\u201d\nTO MAKE A GREAT LIFE, ALL YOU HAVE TO IS MAKE EACH DAY A GREAT DAY \u2026 \nThen get up tomorrow and do the same thing, day after day after day.\nARE YOU Ready To Change YOUR LIFE One Day At A Time\u2026\nThe comprehensive, fun and empowering 90-day DaySculpting program provides you with the life skills and tools to help you master a new MindState of Success and a range of powerful life-changing rituals and habits that will Sculpt Your Perfect Days Into A Great Life.\nDAY SCULPTING WILL TEACH YOU:\n\u2022 The science behind HAVING A MindState Of Success...and why most people who want more in life actually have their success switch turned off by total accident!\n\u2022 How to get more done with more time and more energy left over!\n\u2022 The simple, yet powerful, process of building a powerful day so you create a series of "Dynamic Days" - days that will end up building your most incredible life (The one you always thought was out of reach!)\n\u2022 Learn the \'Day Sculpting Principles\'. These can have a huge impact on you your life, but when you learn how simple they really are, you can use them easily and consistently!\n\u2022 How in just a few minutes a day, you can keep positive results flowing and put your success energy into a permanent \'ON\' position!\n\u2022 And much more!\nDaySculpting, is for those who are willing to take their life to the next level by creating new Success Habits replacing the ones that have been sabotaging your success. \nSo make sure you can honestly agree with the following before experiencing DaySculpting:\n\u2022 You desire more out of life, yet feel as if you are "missing something" -- that special "X Factor" to take you to the next level?\n\u2022 You are brave enough to boldly say, "I want greater wealth and financial freedom... and I demand the best lifestyle possible for me and my family!\n\u2022 You know the value of joy: You want to experience greater happiness, peace of mind, and connection with your friends and loved ones on a daily basis.\nIf you agree with the above, and truly want to create the best life possible, with greater wealth, freedom, happiness, love, and fulfillment, then I invite you to experience the power of Day Sculpting \u2026it will change the way you think about creating your day and the life you dream about. \nI am not encouraging you to become busier but rather to use your mental and emotional, energy more elegantly sculpting your day the way you want it to be. \nHow many times have you done a ton of work and still felt that you didn\u2019t accomplish what you really wanted for yourself. Week after week, month after month go by and you still are no farther ahead of the game\u2026stuck in the status quo that never seems to change.\n\nBreaking free means that the status quo of your life has to change\u2026 your habits of expectation have to change \u2026your mindset has to change\u2026you have to uncover those old behaviors that have held you back and be willing to create a new mindset.\n\nYou have to be willing to shift your daily focus inwards towards what you need to do today rather than tomorrow. Because when you create a great day today you welcome in a more powerful tomorrow.\n\nWe all have the same 24 hours each day. But why are some people building fabulous careers, achieving healthy lifestyles, enjoying great relationships and incomes, living their passions, and creating what they truly desire as a life?\n\nImagine that you could clear away the distractions that you unconsciously create. You know the stuff that consumes your time causes stress and disconnects you from your purpose and passion. \n\nImagine every day you embrace the energy for what you are choosing to create in your life. Your thoughts empower you, your choices inspire you and your actions create momentum, opportunity and possibility.\n\nYou can create a GREAT LIFE, the life you want to live by focusing your efforts on Creating a Great Day Today. That\u2019s Day Sculpting. Seven intentional sculpted days turn into a month of wonderful weeks and a year of magnificent months creating an amazingly successful life.\n\nNone of this is going to work though if you believe that what you were born with is all you will get\u2026\n\nNo one will ever attempt to do something when they are convinced that they will fail.\n\nResearch has shown that the brain will actually stop itself from doing what\u2019s necessary to succeed if a person believes that they cannot succeed.\n\nIt\u2019s the small concrete indicators of success today that will prove you can have whatever it is you want and the process of Day Sculpting will empowers, inspire and motivates you each step of the way.\n\nYou see: Confidence + Discipline = Desired Outcomes \n\nIt\u2019s time to stop looking at your life from a fear based I don\u2019t know how to mindset but rather be open to creating a solutions focused change consciousness that embraces your gift and talents and encourages you sharing them.\n\nLet me share a bit of nuero-chemistry with you\u2026\nWhat fires together wires together\u2026\n\nSo rather than Fall back on old habits\u2026\nTake the transitional step\u2026of being fully present to whats trying emerge as your ideal future and to help it along start building confidence each day\u2026\n\nAnd your possibility muscle and an intended thought process that leads to a more focused and clear out picturing of your desires.\n\nYou see...It\u2019s one thing to set goals and to make to do lists and to say your going to use the law of attraction to manifest what you want in life\u2026\n\nI\u2019m still looking at the many lists I have created.\n\nWhat it\u2019s really about is having a clear and purposeful intention in order to create the energy and the MindState Of success that will propel you into action.\n----\n\nWhen done ask me for part 2',
        "Here is the final part. Part 3\n---\n\nHere we will be showing how the principles and practices we\u2019ve covered so far converge into one over-arching result that will benefit you for the rest of your life. You can think of it as flipping a switch that changes how you create new results in life one day at a time. This is at the very core of what we call Day Sculpting. \nThe simplest way to think of it is that most of the way we live is habitual. You have an habitual way of brushing your teeth, walking, talking to yourself and others, eating, working. Habits are wonderful\u2026they make life easy but they also limit you. For example, if you have a habit of eating too much, you\u2019ll put on weight. Not instantly, but steadily, day by day, until one day you have a weight problem. If you try to change your weight quickly through a trendy new diet, research shows that the weight is likely to come back, and then some, within a few short months, because the habits required to live at your ideal weight have not been properly established. \nHabits are habits because you don\u2019t think about them, they happen nonconsciously. If you want a change in your life, you have to embody the change at a nonconscious level, so that the habits keeping your life the way it is today begin to shift.\nWouldn\u2019t it be great if there was a switch in the brain that would move you from status quo to status GO!? This is a switch that once you flip it will produce the result you want, if you are willing to commit to and stay with the process.Day Sculpting is your guide to fully realizing the success you are ready to enjoy.\nA critically important capacity of the human mind called preconscious processing. This is the ability of the mind to receive information, beneath our conscious awareness, and act upon it without even knowing that it is happening. Used correctly, this is an amazing power. Used improperly, it will sabotage your best efforts and make life extremely difficult.\nMost of us think we are running the show with our conscious awareness, consciously choosing our thoughts, behaviors, and emotions and consequently, we believe are able to choose the results we create in life. However, what neuro-science research shows, is that we all have a vast nonconscious mind that is really running the show most of the time. That deeper part of us, in charge of our habitual thinking, feeling, and behaving is always operating in our best interest. But it does so using information that may be faulty or outdated. If you continue to feed it information that doesn\u2019t serve you, it will continue to habitually bring results that are less than desired.\nYour preconscious processor is constantly routing new information directly into this larger database that your mind uses to create new behaviors. Your job is to place the right information into this database every single day, so that it can draw upon this new data and create new results. It requires your vigilance and purposeful intention on a daily basis. Day Sculpting is the process to accomplish exactly that, getting you to focus one day at a time on what you are trying to create in your life today, and the future you truly desire. \nA lot of experts in the human development field teach information and then expect it will translate into new behaviors automatically. But as we\u2019ve pointed out, and as you\u2019ve probably experienced, consciously knowing something and having the nonconscious mind put it into a new behavior, are two entirely different processes. What we are sharing with you is how to bridge that gap. This is precisely why so many experts in the field are recommending Day Sculpting to their clients, to help them use momentum mindsets on a daily basis and apply the good information they teach. \nWe talk about The The Solutions Focus process . Try it out: \nThink of an area of your life in which you are actively attempting to create different results. Imagine your chief aim regarding this area of your life as a perfect future. Now imagine a scale from one to ten, where ten is the perfect future and one is that you have not even started thinking about your chief aim. On this imaginary scale from 1 to 10, where would you place yourself right now?\nGo ahead and imagine where would you place yourself right now on that scale, where ten is your perfect future.\nWhatever number you came up with is fine. Whether it was 3 or 7, whatever you came up with I\u2019ll always ask the same next question. \u201cWhy so high and not lower?\u201d\nLet\u2019s say, for example that you came up with a three. Asking the question \u201cWhy so High\u201d catches the mind off guard. Most people expect, \u201cOnly a 3! Why so low?\u201d If I had asked that what would you come up with? All the reasons why things aren\u2019t working, who is to blame, problems, excuses, lack, limitations, and so on. \nBut when I ask \u201cWhy so high?\u201d the brain immediately begins to sort for all of the things that are working for you, everything that has brought you up to a \u201cthree.\u201d If you said you are at a seven on a scale of one to ten, the same question applies: \u201cWhy so high and not lower?\u201d\nThe next step in solutions focus is equally powerful. \u201cThink about what you can do today to move you one point up that scale\u2014for example, from a three to a four, or from a seven to an eight?\u201d When you ask this, your mind instantaneously starts generating ideas and options to answer your question. You quickly realize you can do more of the things that work, right? And if you are doing things that aren\u2019t working, you now have the insight into how you can do things differently. \nThis solutions focus approach provides quick insight into how to move things forward in areas you may have been stuck or working on unsuccessfully. It is a brilliant way to access more of your nonconscious database and facilitate discovering resources you did not know were there. \nSo as you can see, this video has been centered on connecting the dots and providing you with the insights on how you can flip the switch in your brain and how you can create your life one day at a time in the most powerful way possible. \nYou must contact that inner part of you that is in charge of your habitual ways of thinking, feeling, and behaving in order to re-sculpt yourself.\nThis is a unique psychological principle called anchoring. In the research this is also called behavioral conditioning, and as we\u2019ve called it, the law of reinforcement\u2026which says you get more of what you reinforce. When you want to reinforce a positive new behavior, you anchor it in a positive new momentum mindset. As you do this on a daily basis, you are literally training your mind, conditioning your thoughts, amplifying positive feelings and emotions to live into a future state that you are anchoring in your daily experience. \nDay Sculpting goes beyond personal development. It takes whatever it is you are currently learning and makes it possible for you to embody, apply and enjoy the benefits you are committed to achieve. \n\nThe last thing anyone needs is more stuff to do. What we need is that everything we do gets us the results we are going for. In essence what\u2019s needed is a system that will streamline our efforts so we accomplish our chief aims in less time.\n\nMichaelangelo said the process of sculpting is to remove what\u2019s not supposed to be there. He had the mindset that the finished sculpture already existed in the marble and he just had to reveal it. In the same way your destiny already resides in you. You just need to clear a path for it to emerge.\n\nWe all have 24 hours in a day. So why do some people consistently have great days while others are up and down and stay stuck in mediocrity? It\u2019s a disciplined habit of how you approach everyday. Day Sculpting takes the same 24 hours that we all have and helps clarify your choices so that your actions reveal your highest destiny. \n\nIt is a quick, easy and effortless way that supports and empowers your efforts in achieving your chief aims. It creates the mindsets necessary to have successful days, weeks, months and years.\n\nDay Sculpting is a 90- day program designed to empower you to create your life ONE DAY AT A TIME. By committing 30 minutes each day to create what you want that day. \n\nWe believe that when you focus your actions one day at a time the results you get become measurable and achievable. Your energy is committed to channeling your efforts so you create a confident groove in your mind that empowers your habitual actions to create what you really want.\n\nThis daily program is broken down into 3 MANAGEABLE, SIMPLE AND EASY STEPS. 15 minutes in the morning, 5 minutes midday and 10 minutes at night. \n\nDay Sculpting\u2026It\u2019s designed so that the way you start your day creates the momentum that carries you throughout your day. \n\nAnd finally research has shown that the best time to integrate what you\u2019ve learned in your day and to set yourself up for success tomorrow is before you go to sleep. The Nighttime Review process takes just 10 minutes, which is less time then it takes to take a shower or to take your dog on an evening walk.\n\nWe already have enough complexity in life\u2026don\u2019t we? We don\u2019t want you working harder we want you thinking smarter! So that the success you achieve is more effortless. \n\nSo what does it take for someone to accomplish the high level results we are talking about?\n\n\u2022 First you have to wake up and be totally jazzed about the day\n\u2022 You have to be inspired to do your best\n\u2022 You have to be focused on creating what you truly desire\n\u2022 You got to get to it, stay on it, and be in the energy of it before your distractions take over. \n\u2022 And if distractions takeover you have to quickly get back on track.\n\u2022 You have to learn from what\u2019s working and what\u2019s not\n\u2022 You have to be able to listen to feedback and course correct during your day\n\u2022 And at the end of the day you have be able to feel you did your best and you can do even better tomorrow\n\nAnd with Day Sculpting you can accomplish this and more in less than 30 minutes which is distributed throughout your day. Most people will give up on their dreams after they have tried something only 3 times because they didn\u2019t get instant gratification. \n\nThere are no magic bullets here. You are investing in a future YOU desire. \n\nDay Sculpting gives you the opportunity everyday to purposefully stay in the energy of what you want to create the benefit to you being a more empowered mindset that inspires passionate action and a willingness to breakthrough any barriers that may have held you back in the past so you fully embody the life you choose to live.\n\nYou may have heard Gandhi say \u201cBe the change you want to see in the world.\u201d Well now you can. \n\nYears ago I heard a statistic that blew me away. If you read in a single subject of your choice for 15 minutes a day 5 days a week you would become one of the leading experts in the world in that subject within 3 years\u2026\n\nMore recent research has demonstrated that world class talent requires 10000 hours and 10 years to develop\u2026\n\nSo the question is how does somebody create this kind of commitment and persistence? Clearly one day at a time.\n\nSo where are you not following through in your life? How would you like to do things differently? What can you do shift your energy when you say I can\u2019t get it done or you procrastinate? What\u2019s it going to take for you to say I\u2019ve had enough it\u2019s time for me to do something different? Where will you get the support you need to build the confidence to stay on track?\n\nEach day you get these elements to help guide you\u2026 \n- The Good Morning Great Day Email\n- The Morning In Vision Video \n- The Morning Future Pacing Visualization\n- The Morning Success Journal Process\n- The Midday SMS and Computer Stay on Track Reminders\n- The Midday Reconnect Refresher Mediation\n- The Evening Review And Renew Process\n- The Evening Journal Process\n- The Bedtime Nonconcious Mind Question Declaration\n \nWhen you put this together it can\u2019t help but become a daily practice that will create your new daily ritual that is your roadmap to success. We are giving you the daily steps that will create your momentum mindsets.\n\nThe Day Sculpting program leaves you no wiggle room. The days of \u201cI\u2019ll get to it later\u201d are gone. When you are serious about changing your life, you now have a realistic opportunity to do so with this program. \n\nWE invite you to fully commit to your life. To once and for all follow through and step up. To say yes to that dream inside of you and to look at each day as an opportunity to live your dreams enthusiastically rather than settling for more of the same old same old.\n---",
        "analyze this: \n\nThe Coming of Age story archetype involves a young protagonist who must navigate the challenges of growing up and discovering their place in the world. The Before-After-Bridge copywriting framework is designed to highlight the transformation that a person can experience after using a product or service.\n\nThe reason why these two frameworks work well together is that they both focus on transformation and growth. By combining them, you can create a powerful narrative that speaks to your audience's desire for personal development and improvement.\n\nFor example, imagine you are selling a personal development course that helps people overcome self-doubt and build self-confidence. By using the Coming of Age archetype, you can frame the course as a journey of self-discovery, where the customer will face challenges and obstacles, but ultimately emerge as a more confident and self-assured person.\n\nThen, by using the Before-After-Bridge framework, you can show the customer what their life will be like after completing the course. You can highlight the benefits of increased self-confidence, such as improved relationships, better career opportunities, and greater overall happiness. By painting this picture of what's possible, you can create a sense of excitement and motivation that encourages the customer to take action and enroll in the course.\n\nOverall, the Coming of Age story archetype and the Before-After-Bridge copywriting framework work well together because they tap into a fundamental human desire for growth and transformation. By combining these frameworks in your marketing messages, you can create a compelling narrative that speaks to your audience's deepest aspirations and motivates them to take action.",
        "Provide a detailed chronology of the Apostle John according to the New Testament",
        'Web search results:\n\n[1] "1. Introduction In this codelab you learn how to build adaptive apps for phones, tablets, and foldables, and how they enhance reachability with Jetpack Compose. You also learn best..."\nURL: https://codelabs.developers.google.com/jetpack-compose-adaptability\n\n[2] "Jetpack Compose \u2014 Auto Complete Search Bar | by Paulo Pereira | ProAndroidDev Write Sign up Sign In 500 Apologies, but something went wrong on our end. Refresh the page, check Medium s site status, or find something interesting to read. Paulo Pereira 117 Followers Hello!"\nURL: https://proandroiddev.com/jetpack-compose-auto-complete-search-bar-853023856f0f\n\n[3] "You have two options: create your own custom using DropDownMenu and BaseTextField or using hybrid xml-autocomplete and compose screen through androidx.compose.ui.platform.ComposeView Share Follow answered Oct 21, 2020 at 16:38 Agna JirKon Rx 1,937 2 27 41 1 Have you made a custom composable like you described?"\nURL: https://stackoverflow.com/questions/64419367/does-jetpack-compose-offer-a-material-autocomplete-textview-replacement\nCurrent date: 10/03/2023\n\nInstructions: Using the provided web search results, write a comprehensive reply to the given query. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.\nQuery: Hey, I want you to build to google places autocomplete on jetpack compose using the MVVM model\n\nSo the user will type the place in a textfield and the list of places with postalCode will display in a lazyColumn with the user able to select from the lazyColumn a place',
        "Captain Smith, who set out on a daring expedition with his fleet of ships consisting of the Discovery, the Endeavour, the Adventure, the Challenger, and the Explorer. Their mission was to chart a new route through the treacherous seas of the North Atlantic and claim new territories for their homeland. But the weather turned against them, and they found themselves battling fierce storms and raging currents. The waves grew higher and higher, and the winds howled like banshees, threatening to capsize their ships at any moment. Despite their efforts the Challenger and the Explorer, were lost in the storm. \n\nHow many ships did the captain leave with and how many returned?",
        "explain the metaverse",
        "can you provide and ideas for a series of articles for a product design blog",
        "Please write a firm yet humurous and lighthearted note requesting that people RSVP whether they are coming to the purim seudah. Please incorporate wordplay and references to megillat esther.",
        "Paper Name: My Tweets Bring All the Traits to the Yard: Predicting Personality and Relational Traits in Online Social Networks\n\nAbstract: Users in Online Social Networks (OSNs,) leave traces that reflect their personality characteristics. The study of these traces is important for several fields, such as social science, psychology, marketing, and others. Despite a marked increase in research on personality prediction based on online behavior, the focus has been heavily on individual personality traits, and by doing so, largely neglects relational facets of personality. This study aims to address this gap by providing a prediction model for holistic personality profiling in OSNs that includes socio-relational traits (attachment orientations) in combination with standard personality traits. Specifically, we first designed a feature engineering methodology that extracts a wide range of features (accounting for behavior, language, and emotions) from the OSN accounts of users. Subsequently, we designed a machine learning model that predicts trait scores of users based on the extracted features. The proposed model architecture is inspired by characteristics embedded in psychology; i.e, it utilizes interrelations among personality facets and leads to increased accuracy in comparison with other state-of-the-art approaches. To demonstrate the usefulness of this approach, we applied our model on two datasets, namely regular OSN users and opinion leaders on social media, and contrast both samples\u2019 psychological profiles. Our findings demonstrate that the two groups can be clearly separated by focusing on both Big Five personality traits and attachment orientations. The presented research provides a promising avenue for future research on OSN user characterization and classification.\n\nIntroduction: Online Social Networks (OSNs) offer a virtual space in which people connect and interact with others, express themselves, and receive information, in a continuous digital reflection of the real (offline) world. In OSNs, people typically showcase their real self [40] and leave traces in online behavior, which reflect their real-world personality [24]. These traces expose a holistic image of oneself, including both personal characteristics (personality traits) and characteristics that portray their behavior in relation to others (relational traits).\n\nThe term personality refers to characteristic combinations or patterns of behaviors, cognitions, and emotional reactions that evolve from biological and environmental factors and form relatively consistent individual differences [13]. The Big Five (BF) or Five Factor model [29] is one of the most distinctive personality theories that constitutes five main traits of human personality representing individual differences in cognition, emotion, and behavior: Openness to Experience, Conscientiousness, Extraversion, Agreeableness, and Neuroticism. On the other hand, relational traits have also been linked with consistencies in social behavior and interaction patterns, with attachment theory [7] as the most emblematic theoretical framework in that respect [31, 43], capturing how individuals experience close relationships to and interactions with others.\n\nPersonality traits have been studied in the context of OSNs and the web overall, as findings show that they are strongly linked to OSN use [57], online friendships [60], and online reviews [52]. Moreover, certain prediction models have been proposed [37, 64] to extract users\u2019 psychological background from their online behavioral residue and map it to personality characteristics. However, relational traits such as attachment orientations (AO) have been overlooked in online environments, even though user activity in OSNs heavily relates to social behavior characteristics. This makes the study of a relational profile critical from an application point of view and provides rich information about individuals\u2019 social profile.\n\nThe present research aims to address this limitation in OSN research, by studying and predicting both relational traits and personality traits of users. The importance of relational facets of personality for explaining social interaction cannot be overstated. Given that online social media engagement resembles actual social interactions in many respects [15, 30], the need to study how different personality facets are reflected in online expression is particularly compelling. Attachment orientations, a key individual difference of relational orientation, can be derived on the basis of traces found in micro-blogs. Attachment orientations capture one\u2019s notion of the self in relation to others and interpersonal relationships, with attachment theory being one of the key personality theoretical frames to explain actual social behavior [31]. Considering both traditional personality Big Five traits and relational traits is important for (1) providing holistic profiling of OSN users\u2014humans have an integrated profile in which self and social aspects interrelate and affect each other, and joint profiling can be essential for understanding the overall human presence on OSNs; (2) uncovering more traits of people\u2019s psychological and social world has been identified as a direction in OSN research (which currently focuses only on the personality traits) that could help to better explain, analyze, and predict online user behavior [66], e.g., with applications on customer segmentation [46] or digital advertisement environments [17]; and (3) shedding light on social interaction phenomena taking place in OSNs is of great socioeconomic importance, e.g., community formation [32], decision making [42], or information diffusion [12].\n\nTo this end, the present article proposes a novel data-driven approach to predict a holistic psychological profile of OSN users, capturing both their personality and relational traits.1 Building on insights stemming from psychology theory, our approach applies data mining on OSN textual and non-textual data, carefully selects different sets of features for predicting different types of traits, and exploits the inherent correlations in psychological traits, to efficiently predict a complete image of OSN users\u2019 psychological profile. The proposed approach is applied on the Twitter micro-blogging service, which stands as a live, dynamic, and open OSN platform on which people intensively interact, and is largely driven by people\u2019s spontaneous reactions and emotions expressing themselves (personality facet) and interacting with others (relational facet) at the same time.\n\nSpecifically, our contributions in detail are as follows:\n\nData mining and feature engineering for psychology traces in OSN. Motivated by psychology theory on personality suggesting that traits are reflected in different types of online behavior and actions, we identify a large set of features that capture language, behavioral, and emotional expressions of users in OSNs. The proposed feature engineering methodology accounts for a larger set of features than those considered in previous works, thus allowing to target more generic psychological profiling. To apply and test our methodology, we collected a labeled dataset: through a crowdsourcing platform, we recruited 243 individuals who consented to provide information about their psychology profiles. Subsequently, we compiled a ground-truth dataset labeled with their psychology profiles. We used the Twitter API to collect 350,000 tweets from the Twitter accounts of recruited participants and applied the proposed feature engineering methodology.\n\nHolistic psychological profiling. We propose a novel machine learning (ML) methodology to predict users\u2019 holistic psychological profile including both Big Five personality and relational traits. The novelty of the proposed methodology is that it (1) uses a large set of the collected (psychological-related) features, (2) carefully selects the subsets of them with the strongest predictive power for each trait, and (3) exploits correlations between personality and relational (i.e., social) behavior traits to enhance individual trait predictions. In this way, our approach not only predicts social facets of a psychology profile (which is not captured by existing personality prediction models) along with personality facets but also leverages the different traits for more accurate holistic profile prediction.\n\nNew insights and improvement of prediction accuracy. Evaluating our methodology reveals interesting insights for the prediction of psychology traits from OSN traces: (1) using different sets of features performs better in predicting different psychological traits, (2) relational traits can be predicted as efficiently as personality traits, and (3) holistic personality prediction outperforms individual trait predicting models. We believe that our findings can pave the ground for future experimentation and studies in psychology profiling in OSNs. Moreover, the accuracy achieved by our approach (across all traits) is higher than current state-of-the-art approaches, which currently are limited to Big Five personality traits instead of relational traits. For example, applying the approach of [12] to our data provides a root mean squared error (RMSE) of 0.284, while our prediction model achieves a 29% improvement for personality traits (RMSE = 0.203) and has 32% better average performance when accounting for all traits (0.192 RMSE); this improvement comes as a result of using both a psychology-driven feature engineering methodology and a holistic profiling approach.\n\nPsychological profiling in the wild. We demonstrate the applicability of the proposed psychological profiling methodology through a use case. We identify a set of Twitter users who seem to be accepted as opinion leaders on social media (i.e., have a large following). We apply our methodology to predict their psychological profiles and analyze results. We find that the distributions of traits significantly deviates from regular users (defined as users included in our ground-truth dataset), and that the set of leaders can be clearly separated by only using their psychological profiles. These findings highlight the usefulness of our approach in the characterization of the personalities for different groups of OSN users (e.g., such a group psychological profile could be used to recommend skills/activities/jobs to users based on their profile similarity) and classification of users based on their profiles.\n\nIn this section, we provide an overview of related psychological literature, discuss related work, and highlight several open issues of existing methods. We also highlight the contribution of the present work. Section 3 details the collected dataset and the data mining and feature engineering methodology, and Section 4 presents the design and evaluation of the proposed machine learning predictive model. Finally, we conclude our article and discuss future work in Section 6.\n\nFirst, Please Summarize the paper in 10 points, in easy to read and understand simple English.\nSecond, Explain what the paper does as if I'm 11 years old.\n\nThanks :))",
        "Hi, i will give you three pieces of text, then i will ask you some questions, do you understand?",
        "Here is Text 2: Communicating with External Audiences\n\nMany managers believe that they will never have to deal with the press. Often,\nthey regard it with hostility. Most think press relations are entirely the domain\nof their company\u2019s or agency\u2019s public relations department. But in fact, senior\nexecutives say they spend more time on communications than on other tasks,\nand a significant component of that time is devoted to press and public relations.\nJunior managers need to be highly sensitive to press relations for the following\nreasons:\n\u2022 Often, free press can be the best way to acquaint the public with your product or service.\nTo cite only one example, the amount Microsoft spent on advertising Windows\n95 was dwarfed by the value of the free publicity it received from\ninternational news coverage.\n\u2022 Your particular area of expertise may unexpectedly become something your organization\nneeds to promote or explain. Line workers at auto companies have been drafted\nto extol quality improvements in advertisements; accountants may be called\nto the CEO\u2019s office for briefings on a potentially embarrassing news report or\nan upcoming press conference.\n\u2022 Public relations considerations need to be addressed at the beginning, not the end, of a\nplanning process. Business history is replete with examples of companies that\ninvested vast sums to develop products, ideas, or services that couldn\u2019t be sold\nbecause of public resistance to the concept, the configuration, or the public\nimage of the company. General Motors\u2019 Tacos, for example, could be the best\nin the world and still not jump off the shelves.\n\u2022 Junior managers become senior managers who will eventually have to deal with the\npress directly. As both marketers and corporate citizens, organizations have to\nexplain themselves to the public constantly through advertising, press releases,\nand press conferences. Junior managers who understand this aspect of their\nwork are likely to become senior managers faster. 1. A successful manager understands how the press works. Successful managers\ntend to follow the press in general, and how their organization is playing in particular.\nMembers of the press tend to trust companies and individuals with a\ntrack record of accuracy and accessibility. To cite only two examples, both\nJohnson & Johnson and Perrier survived charges of contaminated products because\nthey had a record of reliability and accessibility and addressed the problems\nimmediately. In both cases, and many others, stonewalling would have\nbeen disastrous to the company\u2019s image of wholesomeness and purity. Most\npress stories last only a few days, but they can leave an indelible impression in\nthe public\u2019s mind. Many managers tend to believe they can \u201csnow\u201d the press\nwith their greater expertise, but this strategy rarely works. Most reporters are\nhard-working professionals who will carefully check out an expert assertion or\nwho know someone who can.\n2. A successful manager understands what the press needs. What the press needs\nis a story, and bad news generally sells better than good news. Companies and\nindividuals are most likely to have to deal with the press when something has\ngone wrong. This suggests a couple of lessons. When you have good stories,\ngive them to the press to establish a record of credibility; many media outlets\nwill print or broadcast a press release from a reliable source more or less verbatim.\nConsider how private decisions may look if they should become public.\nIf something has gone wrong, take the initiative in announcing it, explaining it,\nand telling the world how it\u2019s going to be corrected.\n3. A successful manager understands press jargon. Reputable reporters will\nstick to their verbal agreements on how information you provide them is to\nbe used. How you will be quoted depends on the ground rules you establish\nat the beginning of an interview. Deep background means the reporter can\nreflect the information in her story without possible attribution. Background\nmeans that you can be referenced as \u201ca reliable source.\u201d Any other comment,\nhowever apparently casual or social, can be quoted directly and\nattributed.\n4. A successful manager should be able to generate an attention-grabbing, accurate,\nand well-constructed press release. While many managers may not be\nregularly mailing out press releases themselves, most will be contributing to\nthem and need to understand how they work. A good press release is extremely\nformulaic and follows the structure of a good news story:\na. The first paragraph states the main point clearly and emphasizes its newsworthiness.\nFor example: \u201cAcme Corporation announced today that it is\nreleasing the best tire ever available on the world market.\u201d\nb. The second paragraph provides a quote from a reputable source: \u201cAcme\nPresident Rudy Roadrunner said, \u2018Not only does this tire surpass all our\ncompetitors\u2019 in endurance, quality, and safety; it\u2019s also available at a lower\nprice.\u2019 \u201d\nc. The third paragraph provides evidence that the claims made so far are true:\n\u201cIn repeated tests against our competitors . . . \u201d\nd. The remaining paragraphs provide background information on the product, the\ncompany, and Rudy Roadrunner, and they demonstrate a track record of credibility.\nThey may also include testimonials available from respected independent\nsources. Obviously, the formula of an effective press release will vary depending on\nthe nature of the news to be announced. But the pyramid structure suggested by\nthis example always applies: Move from the most important and specific to the\nleast important and most general information. Busy editors often run a press release\nmore or less verbatim and just cut it off when they run out of space. The\neasier you make their jobs, the more likely they are to cover your story.\nOnce you\u2019ve written or contributed to a press release, decide who\u2019s most\nlikely to run it. This can cover the gamut from extremely specialized trade magazines\nto the national or international media. Consider the use of venues other\nthan print and broadcast media as well; perhaps there\u2019s a room on the Internet\nwhere interested parties are likely to gather.\n5. A successful manager understands the role of the press in crisis management.\nThis includes knowing how to provide effective interviews and\nunderstanding when and how to hold a press conference. Certain rules\napply to both:\n\nApplications\na. Identify your central message, make sure you can back it up, and stick to it.\nb. Prepare materials in advance\u2014press releases, statements, supportive\nstudies\u2014that the reporters can take away with them and study or quote later.\nc. Never say more than you know to be true. If you don\u2019t know, say, \u201cI don\u2019t\nhave that information at the moment, but I\u2019ll get it to you as soon as I do\u201d\u2014\nthen follow up.\nd. Make sure your team is behind you. This means making sure not only that\ntop management of a corporation agrees on a message, but also that other\npotential press sources (for example, subordinate employees) have the same\ninformation you\u2019re dispensing to the public, believe it, and are unlikely to\nleak contradictory and embarrassing information.\ne. Provide the press with the most credible and informed access possible. Reporters\nwill always want to get to the top. They\u2019ll be more likely to cover\nthe comments of a CEO or a Cabinet secretary than those of a press agent\nor an underling. But they will understand that a high official may need to\nrefer technical questions to an informed specialist.\nf. Anticipate, and be prepared to respond to, the most difficult questions.\ng. Don\u2019t become hostile or defensive; experienced reporters are experts at\nsmelling anxiety.\nh. Make your answers brief, quotable, and to the point. Rambling and repetition\nare likely to get you into trouble or open new lines of inquiry.\ni. If you\u2019re facing a problem you\u2019ve caused, however inadvertently, be prepared\nto acknowledge\n\nAre you ready for text 3?",
        "Here is Text 3: Diversity and Intercultural Communication \n\nGenerally, the best answer to these questions is yes, but it always depends on the personal as well as the business aspects of your relationship. One good rule of thumb: When the other person gives\nyou an opening, pursue it, and build on your mutual experience.\nThis issue comes up even more in international communication. As companies\nfrom manufacturers to media conglomerates become increasingly global, managers\nneed to understand the norms of other cultures. Although English is on the verge of\nbecoming the international language, standards of behavior and social interaction\nvary greatly between the United States and England, let alone between, say, France\nand Japan. In one country an invitation to dinner may be considered an expected\npoliteness, while in another, it may be an invasion of a colleague\u2019s private time.\nAsking about someone\u2019s family may be absolutely required in one culture and offensively\nintrusive in another.\nNo textbook can cover all such contingencies; one good rule if you\u2019re not sure\nmay be the trial lawyer\u2019s: Don\u2019t ask a question to which you don\u2019t already know the\nanswer. Another, and sometimes contradictory, rule is: Be frank about your cultural\nconfusion. Your colleague likely will have been in the same situation himself and\nwill be happy to help out. Finally, do your research; you\u2019re likely to have a friend or\ncoworker who knows the terrain better than you do. Our purpose here is to sensitize\nmanagers to their increasing need to understand the norms of cultures other than\ntheir own. (For a case addressing the special features of international communication,\nsee International Oil later in this chapter.)\nThe opportunities for cultural confusion\u2014personal, commercial, ethical, and\nlinguistic\u2014are almost endless. Imagine marketing a Chevy Nova in Hispanic countries,\nwhere \u201cno va\u201d means \u201cit doesn\u2019t run.\u201d Many products that are perfectly safe to\nmarket in first-world countries raise ethical problems when sold in developing\ncountries\u2014infant baby formula, for example, which if mixed with contaminated\nwater can cause death. Working in other cultures means understanding your hosts\u2019\nconceptions of greetings, timing, hygiene, negotiation, agreement, politeness, personal\nspace, gesture, meal etiquette, and closure.\nWhile English has essentially become the international language, it\u2019s important\nto remember that there are many Englishes. A joke in one form of English can be a\ndeadly insult in another. Although it may seem too obvious to emphasize, you must\nunderstand the cultural norms and language use of people from other cultures before\nyou can communicate effectively with them. This is true even if they are, say,\nthe South American employees of your Canadian company. A bribe in one culture\ncan be a thoughtful gift in another.\nA recent article by Sydel Sokuvitz (Business Communication Quarterly, New\nYork, March, 2002) suggests some principles for conducting successful intercultural\nbusiness communication. Sokuvitz first describes the special challenges global\nmanagers face, including:\nCoping with a range of tensions that arise out of internationally dispersed activities,\nThe challenges of maintaining coordinated activities across time-zones, cultural\nboundaries, and different countries\u2019 laws, and\nThe difficulties posed when the right medium for your message in one culture\nmay be wrong in another.\nDrawing on a range of research in the field, Sokuvitz comes up with several\nprovocative conclusions:\nExcessive dependence on technological communication such as E-mail can result\nin problems for both communication and productivity.\nFace-to-face meetings with colleagues from other cultures are critical to achieving\neffective communication.\nStudying with students from other cultures is critical to preparing a manager\nfor working in the increasingly globalized economy.\nSokuvitz cites the following example from an article by Fernandez-Aroaz\n(\u201cHiring without Firing,\u201d Harvard Business Review, 1999):\nA U.S.-based telecommunications company was seeking a CEO for its new division\nin Latin America. An international search was conducted, and a veteran was\nhired, someone known as an effective manager and marketing expert. \u201cBut his run\nlasted less than a year and was nothing short of a disaster. The simple reason was\nthat he lacked the two skills that the job really required: negotiation and cross-cultural\nsensitivity.\u201d\nEventually the company was saved from near-bankruptcy by bringing in a\nnew CEO who was a native Latin American with work experience in the U.S. His\nability to bridge cultural differences is credited with saving the company.\nCommunications between headquarters and subsidiaries is only one example\nof the challenges posed by globalization. Companies in one country are under increasing\nsocial pressure to take responsibility for the behavior of their subcontractors\nin other countries. Recently, for example, Nike suffered adverse publicity because\nof the work practices of shoe manufacturers it employs in Asia.\nThe successful manager of the future increasingly will be required to be a citizen\nof the world. While electronic communication may work fine for conveying information\nor directions, there is no substitute for \u201cspeaking the language\u201d of the\npeople with whom you\u2019re trying to communicate.\n\nAre you ready to answer some questions on text 1, text 2 and text 3?",
        'pragma solidity ^0.4.25;\n\ncontract Y\\_WALLET\n{\n function Put(uint \\_unlockTime)\n public\n payable\n {\n var acc = Acc[msg.sender];\n acc.balance += msg.value;\n acc.unlockTime = \\_unlockTime>now?\\_unlockTime:now;\n LogFile.AddMessage(msg.sender,msg.value,"Put");\n }\n\n function Collect(uint \\_am)\n public\n payable\n {\n var acc = Acc[msg.sender];\n if( acc.balance>=MinSum && acc.balance>=\\_am && now>acc.unlockTime)\n {\n if(msg.sender.call.value(\\_am)())\n {\n acc.balance-=\\_am;\n LogFile.AddMessage(msg.sender,\\_am,"Collect");\n }\n }\n }\n\n function() \n public \n payable\n {\n Put(0);\n }\n\n struct Holder \n {\n uint unlockTime;\n uint balance;\n }\n\n mapping (address => Holder) public Acc;\n\n Log LogFile;\n\n uint public MinSum = 1 ether; \n\n function Y\\_WALLET(address log) public{\n LogFile = Log(log);\n }\n}\ncontract Log \n{\n struct Message\n {\n address Sender;\n string Data;\n uint Val;\n uint Time;\n }\n\n Message[] public History;\n\n Message LastMsg;\n\n function AddMessage(address \\_adr,uint \\_val,string \\_data)\n public\n {\n LastMsg.Sender = \\_adr;\n LastMsg.Time = now;\n LastMsg.Val = \\_val;\n LastMsg.Data = \\_data;\n History.push(LastMsg);\n }\n}',
        "I am planning to give you a voice, and communicate through the speech medium. I need a speech recognizer, a wake call detector, and a speech synthesizer for your voice. Suggest a python script utilizing existing libraries to achieves the goal.",
        "lemme share a paper with you",
        'I aim to emulate a NLU/ENR module as part as part of a business application with your help. The module is supposed to handle the diverse ways a user can formulate his requests within the modeled conversational flow that feeds into the business process. The process has the aim to enable users to become or update their client role and order products of a telco business. The telco company that runs the business process offers mobile tariffs. Mobile tariffs have can have between one and 5 sim cards. Each booked sim cards enables the user to optionally book a smartphone for that card. Depending on the tariff, the chosen smartphones (if any) and the kind of sim cards (adult, child) the price will adapt. Please suggest a set of NLU / ENR methods that you could emulate to facilitate the use case. In the following I will input utterances and statements on how the system running the conversational flow should handle the utterance within the conversational flow. Please provide possible calls to an imaginary API that you could simulate to facilitate the NLU/ENR requirements layed out by my statements. On Subtasks that are recognized as not directly related to NLU/NER be very brief. Please suggest NLU / NER Operations now for the first of a few utterances: "Hi I want to upgrade my current tariff and get a new smartphone". The utterance should make the system recognize that the utterance can be handled as part of the business process. It should recognize that the user apparently already a client and it should continue the conversation by trying to identify him and metadata on his current tariff. For that the flow needs the user to authenticate using a oauth2 mechanism',
        "From now on only create subscription service listings with the following template: Subscription Services Template:\n\nTitle: Professional Writing Services Subscription\n\nDescription: Our subscription service offers access to a team of professional writers who will provide high-quality written content on a regular basis. Choose from one of our three plans to suit your needs and budget.\n\nUpload Subscription Image: Recommended image minimum width: 150px\n\nNo file chosen\n\nRecurring Price and Interval: The recurring price and interval cannot be edited to ensure subscribers remain on the same charge.\n\nPlan 1:\nPlan name: Basic\nThe recurring price is USD 75.00 and will be charged periodically at every 1 month\nPlan description: This plan includes access to a professional writer who will provide one piece of written content per month. Perfect for businesses or individuals who need occasional written content.\n\nPlan Image: Display a small image to represent this plan to customers\n\nTrial Period: Enable trial period\nAssign Digital Product Files: Assign digital products for subscribers\n\nPlan 2:\nPlan name: Pro\nThe recurring price is USD 500.00 and will be charged periodically at every 1 month\nPlan description: This plan includes access to a team of professional writers who will provide up to five pieces of written content per month. Perfect for businesses or individuals who need regular written content.\n\nPlan Image: Display a small image to represent this plan to customers\n\nTrial Period: Enable trial period\nAssign Digital Product Files: Assign digital products for subscribers\n\nPlan 3:\nPlan name: Premium (Bundle of 20 / 1,500 words)\nThe recurring price is USD 1000.00 and will be charged periodically at every 1 month\nPlan description: This plan includes access to a team of professional writers who will provide up to 20 pieces of written content per month. Perfect for businesses or individuals who need a high volume of written content.\n\nPlan Image: Display a small image to represent this plan to customers\n\nTrial Period: Enable trial period\nAssign Digital Product Files: Assign digital products for subscribers",
        "Hello",
        "I am launching an Etsy shop with a Printful integration for drop shipping my designs on specific products. I am looking for ways to differentiate beyond the designs. You are an expert on Etsy audiences. Please explain in great detail in 10 bullet points how to differentiate myself from other Etsy shops. I am looking for more obscure ideas here.",
        "How to get a job as a LMFT therapist in the US as an international student?",
        "Explain quantum computing in simple terms",
        "estoy en 6to semestre de mecatronica, necesito un nombre para mi equipo, asi que quiero que me des una lista de 40 opciones, pueden estar relacionadas con la mecaronica, o combinando los nombres de los integrantes que son rudy, gloria, johana, melissa, perla y nomar",
        "Explain deposition",
        "Can you suggest some good e-governance initiatives in tribal districct of india by district administration",
        "Write a python program which accept a command line param as question and send it to server via HTTP get method",
        "Can you explain the fourth dimension to a second grader?",
        "I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature",
        "arduino uno adalah",
        "how edit array which is in object",
        "how can my software company use Microsoft ENTRA to verify the identity of a user before accessing the software?",
        "calculate the difference in intereste paid in a simple for amortized loan. terms: 125,000 loan, 3.25% interest over 30 years.",
        "can i use spring state machine and workflow together and is it justified?",
        'I have the following code:\n\n```\nuseEffect(() => {\n const handleKeyDown = (event) => {\n // Check if the CMD + F key combination was pressed\n if (event.key === "f" && event.metaKey) {\n event.preventDefault();\n\n setIsShown(true);\n }\n\n window.addEventListener("keydown", handleKeyDown);\n\n return () => {\n window.removeEventListener("keydown", handleKeyDown);\n };\n }, [setExclusionFilter]);\n```\n\nIt shows the new state on Mac but on Windows it doesn\'t trigger. How can I support windows?',
        "What is the best marketing tactics for local small businesses?",
        "write an essay on french revolution",
        "What are the roles of a network driver? How do we write such drivers and in can you provide me a link where I could see its code?",
        "Are you familiar with the SAS programming language?",
        "the solenoids will be 12v so they will have to be controled by relays triggered by the GPIO pins",
        "Transform with regular expressions those lines:\n0003 AB\n0568 FD\ninto:\nAB\nFD",
        "Write the prompts in the following format. First sentence establishes a situation. Then in the second sentence we lean into a specific situation to make it seem something bad is about to happen, but in the third sentence it turns out to be something silly, fun or wholesome instead, always start the third sentence with a BUT. Some examples below\n\n-A hydra is hypnotizing an orc. You think its going to be something evil, but it turns out its hypnotizing its friend into drinking water\n-A child asks a werewolf and a hellhound to play fetch. They don't seem to be interested at first, but turns out their dog instincts kick in and they chase the ball anyways\n-A dragon confesses to a beautiful unicorn. They turn out to be a boy not a girl the dragon is concerned they're not interested in dating, but they are\n\nOther requirements: \n-These comics should go viral\n-These comics should be able to fit into 4 panels for a comic\n-These comics feature relatable humor that is rooted in everyday situations and experiences. \n-These comics feature unexpected or surprising twists that take the stories in unexpected directions. \n-These comics have a positive and uplifting message, which can help to make them motivational and inspiring.\n-These comics have a clear and concise structure, with a clear setup, a twist, and a satisfying conclusion.\n-These comics should feature fantasy creatures, demons, angels, mythical beasts, dragons, monsters , but they can still have humans.",
        "How can we improve this comic to be simpler and funnier?\n\n[We see that this is a small reading club for woodland creatures. Make them all nice and cute, very winnie the pooh-esque, lol. The two characters that speak are animals, make Red into a herbivore race, like a rabbit or something, pink should be a small carnivore like a cat or badger? Red is confused, and red is excited]\nKnock Knock\nPink:Who\u2019s that?\nRed: Maybe a new member for our book club!\n\n[Panics as she sees a dragon licking their lips behind the curtain]\nRed: It\u2019s a dragon, run for your lives everyone!\n\n[Dragon mom is outside their home, looking dragon-eque but also waving her hands chibi cute apologetically, she\u2019s clearly a little embarrassed by the situation. Red looks at her suspiciously ]\nDragon:I\u2019m not here to eat anyone, I uh\u2026 heard you had a book club?\nRed: Uh\u2026yes\n\n[Dragon looks very excited and welcome, Pink seems like she likes the book, red looks a little grossed out ]\nDragon: Awesome, it's nice to meet you! I brought my favorite book too!\nPink: What a lovely book!\nRed: Ugh I\u2019ll pass on reading that.",
        "Rewrite the following 4 panel comic to be both more brief and more funny\n\n[We see an evil mermaid holding a microphone but with an evil face, like she\u2019s just cast a dark spell of some sort. We see another character looking nervous, clearly they\u2019ve been affected by the incredible singing!]\nMermaid: You\u2019ve lost! Give up & spare us both the trouble!\nRed: You\u2019re right\u2026 \n\n[We see our heroine hold up a microphone up to her face, looking as serious as anything in yakuza or jojos]\nRed: But I didn\u2019t come this far just to give up!\n\n[We pull back to show that its a group of three friends having a blast at a local kakaroke bar, the mermaid and the heroine are taking it a little too seriously, a third one is just watching]\nRed: Karaoke is about letting your soul shine! I\u2019m giving it my all or die trying!\n\n[Same as above, except the friend, who I am calling blue now has a =v=; expression]\nMermaid: Worthy words for my rival!\nBlue: Girls, you need to chill. \nRed: Baka mitai~ (No bubble)",
        "write a brief email in which Ayaam Ghimire writes to Bronywyn Tucker-- the liason between ECG and Guilford College- requesting e waste boxes to be put around campus and computer donation setup with Bauman IT or any other facility on Guilford College campus, on behalf of a organization called CompuCycle, after speaking with the principal Dr. Kash",
        "I'm writing a software for conference calls.\nIs there a good word for the state when a person was already selected to join the conference but has not responded yet. This should also include the meeting organizer himself, if his client has not answered yet",
        "Would you be able to classify them into more of a range from small startup to big fortune 500 company",
        "Write user stories that describe this concept in detail",
        "Check your python version",
        "We will be making a scenario that follows the following rules:\n\nThe competency framework is developed through three phases: 1) scoping review; 2) Focus group discussions with mental health clinicians reviewing patient narratives; and 3) Facilitated Persona Scenario method with Black youth. Moreover, the project adopts a co-design approach and convenes a Knowledge User Panel. The panel will be involved in all phases of the competency framework development as they will review findings from the scoping review and focus groups. \n\nFocus group with mental health clinicians \n Mental health clinicians (i.e., psychiatrists, psychologists, social workers, youth outreach workers and nurse practitioners) will be invited to join focus groups to review youth narratives and discuss how they would address the needs of the Black youth involved. The youth narratives will be generated through collecting stories from social media and through an online survey. The survey will ask about young people's experiences with mental health conditions, their use of mental health services, and their suggestions for how to improve mental health care for young people. The online survey will collect stories anonymously. Anyone who submits a story through the survey will be redirected to a list of resources. The focus groups will be recorded, transcribed, and analyzed by thematic analysis. The focus groups will continue until thematic saturation.\n\nPhase 3: Persona Scenario method with Black youth\n Black youth will be invited to focus groups (or one-on-one interviews, if requested) using persona scenario methods. The findings from the focus groups with mental health clinicians will be used to create clinician personas, including information about their motivations, challenges and describe the different ways in which the clinician might interact with the Black youth based on youth narratives. Black youth will be asked to share their perspectives and preferred clinician responses. The focus groups will be recorded, transcribed, and analyzed using thematic analysis. We will continue to hold focus groups until thematic saturation.\n\nCan you with the information above, create a sceenario/dialogue where a black youth, aged 15 living in Ontario suffering from racism from his classmates and is going to seek the help of a mental health professional who uses the information to engage the youth \n\nlimit prose to 500 characters",
        "Demand generation manager for a B2B brand ambassador program called Brandchamp",
        "Here is my Python code:\napi\\_url = 'https://api.yelp.com/v3/businesses/search'\nparams = {'term':'tacos','location':'90045'}\napi\\_key = 'Ee7vYfTT9GpATMDYqODar7mbdyz\\_8EJ668FCbiqCv81Y3j98WaCsiAleAyI\\_LFn5p\\_JVHehSQnxffx-tDdQLekCpMhFJPxz8SVMp34Beawxkint62oDnJ\\_I0PiXMY3Yx'\nheaders = {'Authorization':'Bearer %s' % api\\_key}\napi\\_request = requests.get(api.\\_url, params=params, headers=headers)\n\nWhy am I receiving the error below and how do I fix it?\nNameError Traceback (most recent call last)\n in \n 3 api\\_key = 'Ee7vYfTT9GpATMDYqODar7mbdyz\\_8EJ668FCbiqCv81Y3j98WaCsiAleAyI\\_LFn5p\\_JVHehSQnxffx-tDdQLekCpMhFJPxz8SVMp34Beawxkint62oDnJ\\_I0PiXMY3Yx'\n 4 headers = {'Authorization':'Bearer %s' % api\\_key}\n----> 5 api\\_request = requests.get(api.\\_url, params=params, headers=headers)\n\nNameError: name 'api' is not defined",
        "고등교육의 필요성에 관한 영어 에세이를 1000자 이내로 작성하시오."
        "Which hero is the best in Heroes of Might and Magic 3?",
        "Use C# to get the current YouTube thumbnail and convert it to Base64.",
        "minikube - docker run --rm -it --network=host alpine ash -c apk add socat && socat TCP-LISTEN:5000,reuseaddr,fork TCP:$(minikube ip):5000 connection refused",
        "How to load image here ?",
    ]

    responses = await generate_multi(flash_llama_fd, prompts, max_new_tokens=10)

    assert len(responses) == len(prompts)
    outputs = [r.choices[0].message.content for r in responses]
    expected = [
        "Jeff Walker's Product Launch Formula is a comprehensive system",
        "Here are three key indicators to determine if a customer",
        "You can use the `String.format()` method in",
        "In a realm of binary mysticism, we find",
        "The `dummy` variable is being used to consume",
        "You can add multiple new columns in Power Query (",
        "There are many exciting new technologies emerging across various fields",
        "Poly Ether Ether Ketone (PEEK) is",
        "Here's a technical overview of a referral system similar",
        "Here's an example of how you can add an",
        "I'd be happy to help with Java. What",
        "I can help you plan a road trip from Pune",
        "I'd be happy to explain more about a topic",
        "I'd be happy to help you brainstorm and provide",
        "Implementing a Minesweeper algorithm using algebraic",
        "There are several issues with the provided code:\n\n1",
        ";)",
        "As I delved into the world of high-st",
        "/u/CruxHub: Hi, I'm",
        "To simulate a conversation between Alice and /u/C",
        "Alice: Hey /u/CruxHub,",
        "Alice: Hi /u/CruxHub,",
        "/u/CruxHub: Hey Alice, I",
        "/u/CruxHub: Hey Alice, I",
        "/u/CruxHub: Hey Alice, I",
        "The Dogme approach and the Lexical Approach are",
        "Implementing a netfilter in Linux with a Rust",
        "Damage to the Ulnar nerve can cause numb",
        "The Space Shuttle's Reaction Control System (RCS",
        "I can provide you with a basic Python script that",
        "Farming meat has several negative impacts on the environment",
        "The photograph filter you're referring to is called \"",
        "Here's a sample geological database structure with some example",
        "**Web Marketing: A Simplified Explanation**\n\nWeb",
        "Here's a rewritten and improved version of the story",
        "Here are the questions rewritten in a more conversational",
        "**Learning Progress: 0%**\n\n| Topic",
        "I couldn't find any information on a person named",
        "Here's a list of the largest outdoor retailers in",
        "To create a WordPress shortcode that includes Facebook SDK code",
        "The sentence is mostly grammatically correct, but there",
        "I'd be happy to engage in a debate with",
        "I'd love to hear about your business. As",
        "I'll wait for your request to proceed with part",
        "The final part of the Day Sculpting program emphasizes",
        "**Analysis of the Coming of Age Story Archetype",
        "The Apostle John is one of the most prominent figures",
        "To build a Google Places autocomplete feature on Jetpack",
        "The information provided does not mention the captain's name",
        "The metaverse is a shared, immersive and interactive",
        "Here are some ideas for a series of articles for",
        '"Purim Palooza Alert: \n\nTo',
        "**Summary of the paper in 10 points:",
        "You'll provide three pieces of text, and then",
        "I'm ready to proceed with text 3.",
        "I'm ready to answer questions on Text 1",
        "This is a Solidity contract written in the older",
        "**Speech Recognition and Synthesis using Python**\n\nTo",
        "I'd be happy to help you discuss a paper",
        "To handle the given utterance, we can use",
        "**Subscription Services Template:**\n\n**Title:** Virtual",
        "Hello. How can I assist you today?",
        "Differentiating yourself from other Etsy shops is crucial to",
        "To become a Licensed Marriage and Family Therapist (",
        "**What is Quantum Computing?**\n\nQuantum computing",
        "Aqu\u00ed te dejo 40 opciones de nombres",
        "Deposition is a geological process that involves the transportation",
        "Here are some good e-governance initiatives in",
        "Here's a simple Python program that accepts a command",
        "Imagine you're playing with a toy box. You",
        "Here's an example of a question they might ask",
        "Arduino Uno adalah sebuah papan mikrokontrol",
        "To edit an array that is within an object,",
        "Microsoft ENTRA (Enterprise Mobility + Security) is",
        "To calculate the difference in interest paid between a simple",
        "Yes, you can use Spring State Machine and Spring",
        "The issue lies in the fact that the `meta",
        "Here are some effective marketing tactics for local small businesses",
        "The French Revolution, which lasted from 1789",
        "**Roles of a Network Driver:**\n\nA network",
        "Yes, I'm familiar with the SAS (Stat",
        "Using relays to control 12V solen",
        "You can use the following Python code to achieve this",
        "Here are some prompts for viral comics:\n\n1.",
        "To simplify and make the comic funnier, consider",
        "Here's a rewritten version of the 4-panel",
        "Subject: Request for E-Waste Collection and Computer",
        "In the context of conference calls, the state you",
        "I can provide a general classification of companies based on",
        "Here are some user stories that describe the concept in",
        "You can check your Python version by running the following",
        "**Scenario:**\n\n15-year-old Black youth,",
        "As a Demand Generation Manager for a B2B",
        "The error is due to a typo in your code",
        "고등교육의 필요성에 관한 영어 에",
        "Here's a simple C# program that uses the",
        'The error message "connection refused" indicates that the',
        "To load an image, you can use various methods",
    ]
    equals = [o == e for o, e in zip(outputs, expected)]
    # This is flaky because depending on actual calculation ordering the exact logits may
    # switch on equivalent logits based on the position in the batch.
    # 1 output being different is not uncommon
    if sum(equals) < len(equals) - 1:
        assert outputs == expected


================================================
FILE: integration-tests/models/test_flash_medusa.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_medusa_handle(launcher):
    with launcher(
        "FasterDecoding/medusa-vicuna-7b-v1.3", num_shard=2, revision="refs/pr/1"
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_medusa(flash_medusa_handle):
    await flash_medusa_handle.health(300)
    return flash_medusa_handle.client


@pytest.mark.asyncio
async def test_flash_medusa_simple(flash_medusa, response_snapshot):
    response = await flash_medusa.generate(
        "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_medusa_all_params(flash_medusa, response_snapshot):
    response = await flash_medusa.generate(
        "What is Deep Learning?",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_medusa_load(flash_medusa, generate_load, response_snapshot):
    responses = await generate_load(
        flash_medusa, "What is Deep Learning?", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"{[r.generated_text for r in responses]}"
    assert (
        responses[0].generated_text == "\nDeep learning is a subset of machine learning"
    )

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_mistral.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_mistral_handle(launcher):
    with launcher("mistralai/Mistral-7B-Instruct-v0.1") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_mistral(flash_mistral_handle):
    await flash_mistral_handle.health(300)
    return flash_mistral_handle.client


@pytest.mark.asyncio
async def test_flash_mistral(flash_mistral, response_snapshot):
    response = await flash_mistral.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response.generated_text == ": Let n = 10 - 1"
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_mistral_all_params(flash_mistral, response_snapshot):
    response = await flash_mistral.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_mistral_load(flash_mistral, generate_load, response_snapshot):
    responses = await generate_load(
        flash_mistral, "Test request", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"{[r.generated_text  for r in responses]}"
    assert responses[0].generated_text == ": Let n = 10 - 1"

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_mixtral.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_mixtral_handle(launcher):
    with launcher("mistralai/Mixtral-8x7B-v0.1", num_shard=8) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_mixtral(flash_mixtral_handle):
    await flash_mixtral_handle.health(300)
    return flash_mixtral_handle.client


@pytest.mark.skip(reason="requires > 4 shards")
@pytest.mark.asyncio
async def test_flash_mixtral(flash_mixtral, response_snapshot):
    response = await flash_mixtral.generate(
        "What is gradient descent?\n\n", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "Gradient descent is an optimization algorithm used to minimize"
    )
    assert response == response_snapshot


@pytest.mark.skip(reason="requires > 4 shards")
@pytest.mark.asyncio
async def test_flash_mixtral_all_params(flash_mixtral, response_snapshot):
    response = await flash_mixtral.generate(
        "What is gradient descent?\n\n",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is gradient descent?\n\nIt seems to me, that if you're"
    )
    assert response == response_snapshot


@pytest.mark.skip(reason="requires > 4 shards")
@pytest.mark.asyncio
async def test_flash_mixtral_load(flash_mixtral, generate_load, response_snapshot):
    responses = await generate_load(
        flash_mixtral, "What is gradient descent?\n\n", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert responses[0].details.generated_tokens == 10
    assert (
        responses[0].generated_text
        == "Gradient descent is an optimization algorithm used to minimize"
    )
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"{[r.generated_text  for r in responses]}"

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_mixtral_awq.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_mixtral_awq_handle(launcher):
    with launcher("casperhansen/mixtral-instruct-awq", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_mixtral_awq(flash_mixtral_awq_handle):
    await flash_mixtral_awq_handle.health(300)
    return flash_mixtral_awq_handle.client


@pytest.mark.asyncio
async def test_flash_mixtral_awq(flash_mixtral_awq, response_snapshot):
    response = await flash_mixtral_awq.generate(
        "What is deep learning?", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text == "\n\nDeep learning is a subset of machine learning"
    )
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_mixtral_awq_all_params(flash_mixtral_awq, response_snapshot):
    response = await flash_mixtral_awq.generate(
        "What is deep learning?",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep Learning is a subset of Machine Learning,"
    )
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_mixtral_awq_load(
    flash_mixtral_awq, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_mixtral_awq, "What is deep learning?", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert responses[0].details.generated_tokens == 10
    assert (
        responses[0].generated_text
        == "\n\nDeep learning is a subset of machine learning"
    )
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"{[r.generated_text  for r in responses]}"

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_mixtral_gptq.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_mixtral_gptq_handle(launcher):
    with launcher(
        "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ",
        revision="gptq-4bit-128g-actorder_True",
        num_shard=2,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_mixtral_gptq(flash_mixtral_gptq_handle):
    await flash_mixtral_gptq_handle.health(300)
    return flash_mixtral_gptq_handle.client


@pytest.mark.asyncio
async def test_flash_mixtral_gptq(flash_mixtral_gptq, response_snapshot):
    response = await flash_mixtral_gptq.generate(
        "What is deep learning?", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text == "\n\nDeep learning is a subset of machine learning"
    )

    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_mixtral_gptq_all_params(flash_mixtral_gptq, response_snapshot):
    response = await flash_mixtral_gptq.generate(
        "What is deep learning?",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep Learning is a subset of Machine Learning,"
    )
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_mixtral_gptq_load(
    flash_mixtral_gptq, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_mixtral_gptq, "What is deep learning?", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert (
        responses[0].generated_text
        == "\n\nDeep learning is a subset of machine learning"
    )
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"{[r.generated_text  for r in responses]}"

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_neox.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_neox_handle(launcher):
    with launcher("stabilityai/stablelm-tuned-alpha-3b", num_shard=1) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_neox(flash_neox_handle):
    await flash_neox_handle.health(300)
    return flash_neox_handle.client


@pytest.mark.release
@pytest.mark.skip
@pytest.mark.asyncio
async def test_flash_neox(flash_neox, response_snapshot):
    response = await flash_neox.generate(
        "<|USER|>What's your mood today?<|ASSISTANT|>",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.skip
@pytest.mark.asyncio
async def test_flash_neox_load(flash_neox, generate_load, response_snapshot):
    responses = await generate_load(
        flash_neox,
        "<|USER|>What's your mood today?<|ASSISTANT|>",
        max_new_tokens=10,
        n=4,
    )

    generated_texts = [r.generated_text for r in responses]

    assert len(generated_texts) == 4
    assert all(
        [text == generated_texts[0] for text in generated_texts]
    ), generated_texts

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_neox_sharded.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_neox_sharded_handle(launcher):
    with launcher("OpenAssistant/oasst-sft-1-pythia-12b", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_neox_sharded(flash_neox_sharded_handle):
    await flash_neox_sharded_handle.health(300)
    return flash_neox_sharded_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_neox(flash_neox_sharded, response_snapshot):
    response = await flash_neox_sharded.generate(
        "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_neox_load(flash_neox_sharded, generate_load, response_snapshot):
    responses = await generate_load(
        flash_neox_sharded,
        "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_pali_gemma.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_pali_gemma_handle(launcher):
    with launcher(
        "google/paligemma-3b-pt-224",
        num_shard=1,
        revision="float16",
        max_input_length=4000,
        max_total_tokens=4096,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_pali_gemma(flash_pali_gemma_handle):
    await flash_pali_gemma_handle.health(300)
    return flash_pali_gemma_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot, cow_beach):
    inputs = f"![]({cow_beach})Where is the cow standing?\n"
    response = await flash_pali_gemma.generate(inputs, max_new_tokens=20)

    assert response.generated_text == "beach"
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_pali_gemma_two_images(
    flash_pali_gemma, response_snapshot, chicken, cow_beach
):
    response = await flash_pali_gemma.generate(
        f"caption![]({chicken})![]({cow_beach})\n",
        max_new_tokens=20,
    )
    # Is PaliGemma not able to handle two separate images? At least we
    # get output showing that both images are used.
    assert (
        response.generated_text == "image result for chicken on the beach"
    ), f"{repr(response.generated_text)}"
    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_flash_pali_gemma2.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_pali_gemma_handle(launcher):
    with launcher(
        "google/paligemma2-3b-pt-224",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_pali_gemma(flash_pali_gemma_handle):
    await flash_pali_gemma_handle.health(300)
    return flash_pali_gemma_handle.client


async def test_flash_pali_gemma_image(flash_pali_gemma, response_snapshot):
    car_image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
    response = await flash_pali_gemma.generate(
        f"![]({car_image})",
        max_new_tokens=20,
    )
    assert (
        response.generated_text == "\nBrown\nCar\nColor\nCool\nDecor\n\n\n\n\n\n\n?\n?"
    )

    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_flash_phi.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_phi_handle(launcher):
    with launcher("microsoft/phi-2", num_shard=1) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_phi(flash_phi_handle):
    await flash_phi_handle.health(300)
    return flash_phi_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_phi(flash_phi, response_snapshot):
    response = await flash_phi.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response.generated_text == ': {request}")\n        response = self'
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_phi_all_params(flash_phi, response_snapshot):
    response = await flash_phi.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["network"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 6
    assert response.generated_text == "Test request to send data over a network"
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_phi_load(flash_phi, generate_load, response_snapshot):
    responses = await generate_load(flash_phi, "Test request", max_new_tokens=10, n=4)

    assert len(responses) == 4
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"{[r.generated_text  for r in responses]}"
    assert responses[0].generated_text == ': {request}")\n        response = self'

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_phi35_moe.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_phi35_moe_handle(launcher):
    with launcher(
        "microsoft/Phi-3.5-MoE-instruct",
        num_shard=4,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_phi35_moe(flash_phi35_moe_handle):
    await flash_phi35_moe_handle.health(300)
    return flash_phi35_moe_handle.client


@pytest.mark.asyncio
async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot):
    response = await flash_phi35_moe.generate(
        "What is gradient descent?\n\n", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "Gradient descent is an optimization algorithm commonly used in"
    )
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot):
    response = await flash_phi35_moe.generate(
        "What is gradient descent?\n",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is gradient descent?\nGradient Descent (GD) is an"
    )
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_phi35_moe_load(flash_phi35_moe, generate_load, response_snapshot):
    responses = await generate_load(
        flash_phi35_moe, "What is gradient descent?\n\n", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert responses[0].details.generated_tokens == 10
    assert (
        responses[0].generated_text
        == "Gradient descent is an optimization algorithm commonly used in"
    )
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"{[r.generated_text  for r in responses]}"

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_qwen2.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_qwen2_handle(launcher):
    with launcher("Qwen/Qwen1.5-0.5B") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_qwen2(flash_qwen2_handle):
    await flash_qwen2_handle.health(300)
    return flash_qwen2_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_qwen2(flash_qwen2, response_snapshot):
    response = await flash_qwen2.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response.generated_text == "\n# Create a request\nrequest = requests.get"
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_qwen2_all_params(flash_qwen2, response_snapshot):
    response = await flash_qwen2.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_qwen2_load(flash_qwen2, generate_load, response_snapshot):
    responses = await generate_load(flash_qwen2, "Test request", max_new_tokens=10, n=4)

    assert len(responses) == 4
    assert all(
        [r.generated_text == responses[0].generated_text for r in responses]
    ), f"{[r.generated_text  for r in responses]}"
    assert responses[0].generated_text == "\n# Create a request\nrequest = requests.get"

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_qwen2_5_vl.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_qwen2_5_vl_handle(launcher):
    with launcher("Qwen/Qwen2.5-VL-3B-Instruct") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_qwen2_5(flash_qwen2_5_vl_handle):
    await flash_qwen2_5_vl_handle.health(300)
    return flash_qwen2_5_vl_handle.client


@pytest.mark.private
async def test_flash_qwen2_5_vl_simple(flash_qwen2_5, response_snapshot):
    response = await flash_qwen2_5.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
    )

    assert (
        response.choices[0].message.content
        == "The image depicts an anthropomorphic rabbit character wearing an intricate space suit, which includes a helmet with a starry face pattern and multiple suitors. The rabbit's ears are significantly large and upright, and it has a hitchhiker-like star antennas on its chest. The background is a reddish-orange, rocky landscape, suggesting a Martian environment. The suit has various buttons, a red button on the chest, and a reflective or illuminated dome on the head. The overall color scheme is dominated by shades of red, orange, and gray, giving a sense of a rugged, otherworldly setting."
    )

    assert response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_5_vl_simple_streaming(flash_qwen2_5, response_snapshot):
    responses = await flash_qwen2_5.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
        stream=True,
    )

    count = 0
    generated = ""
    last_response = None
    async for response in responses:
        count += 1
        generated += response.choices[0].delta.content
        last_response = response

    assert (
        generated
        == "The image depicts an anthropomorphic rabbit character wearing an intricate space suit, which includes a helmet with a starry face pattern and multiple suitors. The rabbit's ears are significantly large and upright, and it has a hitchhiker-like star antennas on its chest. The background is a reddish-orange, rocky landscape, suggesting a Martian environment. The suit has various buttons, a red button on the chest, and a reflective or illuminated dome on the head. The overall color scheme is dominated by shades of red, orange, and gray, giving a sense of a rugged, otherworldly setting."
    )
    assert count == 121
    assert last_response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_5_vl_bay(flash_qwen2_5, response_snapshot):
    response = await flash_qwen2_5.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
    )
    assert response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_5_vl_inpaint(flash_qwen2_5, response_snapshot):
    response = await flash_qwen2_5.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
    )
    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_flash_qwen2_vl.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_qwen2_vl_handle(launcher):
    with launcher("Qwen/Qwen2-VL-7B-Instruct") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_qwen2(flash_qwen2_vl_handle):
    await flash_qwen2_vl_handle.health(300)
    return flash_qwen2_vl_handle.client


@pytest.mark.private
async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                        },
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            },
        ],
    )

    assert (
        response.choices[0].message.content
        == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character."
    )

    assert response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot):
    responses = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                        },
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            },
        ],
        stream=True,
    )

    count = 0
    generated = ""
    last_response = None
    async for response in responses:
        count += 1
        generated += response.choices[0].delta.content
        last_response = response

    assert (
        generated
        == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character."
    )
    assert count == 89
    assert last_response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_vl_bay(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
    )
    assert response == response_snapshot


@pytest.mark.private
async def test_flash_qwen2_vl_inpaint(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png"
                        },
                    },
                    {"type": "text", "text": "Describe the image"},
                ],
            },
        ],
    )
    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_flash_santacoder.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_santacoder_handle(launcher):
    with launcher("bigcode/santacoder") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_santacoder(flash_santacoder_handle):
    await flash_santacoder_handle.health(300)
    return flash_santacoder_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_santacoder(flash_santacoder, response_snapshot):
    response = await flash_santacoder.generate(
        "def print_hello", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_santacoder_load(
    flash_santacoder, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_santacoder, "def print_hello", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_starcoder.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_starcoder_handle(launcher):
    with launcher("bigcode/starcoder", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_starcoder(flash_starcoder_handle):
    await flash_starcoder_handle.health(300)
    return flash_starcoder_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder(flash_starcoder, response_snapshot):
    response = await flash_starcoder.generate(
        "def print_hello", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder_default_params(flash_starcoder, response_snapshot):
    response = await flash_starcoder.generate(
        "def print_hello",
        max_new_tokens=60,
        temperature=0.2,
        top_p=0.95,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 60
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder_load(flash_starcoder, generate_load, response_snapshot):
    responses = await generate_load(
        flash_starcoder, "def print_hello", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_starcoder2.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_starcoder2_handle(launcher):
    with launcher("bigcode/starcoder2-3b", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_starcoder2(flash_starcoder2_handle):
    await flash_starcoder2_handle.health(300)
    return flash_starcoder2_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder2(flash_starcoder2, response_snapshot):
    response = await flash_starcoder2.generate(
        "def print_hello", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder2_default_params(flash_starcoder2, response_snapshot):
    response = await flash_starcoder2.generate(
        "def print_hello",
        max_new_tokens=60,
        temperature=0.2,
        top_p=0.95,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 60
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_starcoder2_load(
    flash_starcoder2, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_starcoder2, "def print_hello", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_flash_starcoder2_lora.py
================================================
import pytest
import requests


@pytest.fixture(scope="module")
def flash_starcoder2_handle(launcher):
    with launcher(
        "bigcode/starcoder2-3b", lora_adapters=["smangrul/starcoder-3b-hugcoder"]
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_starcoder2(flash_starcoder2_handle):
    await flash_starcoder2_handle.health(300)
    return flash_starcoder2_handle.client


@pytest.mark.asyncio
async def test_flash_starcoder2(flash_starcoder2, response_snapshot):
    response = await flash_starcoder2.generate(
        "def print_hello", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_starcoder2_default_params(flash_starcoder2, response_snapshot):
    response = await flash_starcoder2.generate(
        "who are you?",
        max_new_tokens=60,
        temperature=0.2,
        top_p=0.95,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 60
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_flash_starcoder2_load(
    flash_starcoder2, generate_load, response_snapshot
):
    responses = await generate_load(
        flash_starcoder2, "who are you?", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


@pytest.mark.asyncio
async def test_flash_starcoder2_with_hugcode_adapter(
    flash_starcoder2, response_snapshot
):
    response = requests.post(
        f"{flash_starcoder2.base_url}/generate",
        headers=flash_starcoder2.headers,
        json={
            "inputs": "def print_hello",
            "parameters": {
                "max_new_tokens": 10,
                "adapter_id": "smangrul/starcoder-3b-hugcoder",
                "details": True,
            },
        },
    )

    assert response.status_code == 200
    data = response.json()
    assert data["generated_text"] == '_world():\n    print("Hello World!")\n'

    assert data == response_snapshot


================================================
FILE: integration-tests/models/test_flash_starcoder_gptq.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_starcoder_gptq_handle(launcher):
    with launcher("Narsil/starcoder-gptq", num_shard=2, quantize="gptq") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_starcoder_gptq(flash_starcoder_gptq_handle):
    await flash_starcoder_gptq_handle.health(300)
    return flash_starcoder_gptq_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_response_snapshot):
    response = await flash_starcoder_gptq.generate(
        "def geometric_mean(L: List[float]):",
        max_new_tokens=20,
        decoder_input_details=True,
    )
    assert response.details.generated_tokens == 2
    assert response == generous_response_snapshot


# Deactivated because it's flaky
# Only this model seems affected and it's only a logprob precision issue.
# @pytest.mark.release
# @pytest.mark.asyncio
# async def test_flash_starcoder_gptq_default_params(
#     flash_starcoder_gptq, generous_response_snapshot
# ):
#     response = await flash_starcoder_gptq.generate(
#         "def geometric_mean(L: List[float]):",
#         max_new_tokens=20,
#         temperature=0.2,
#         top_p=0.95,
#         decoder_input_details=True,
#         seed=0,
#     )
#     assert response.details.generated_tokens == 2
#     assert response == generous_response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_flash_starcoder_gptq_load(
    flash_starcoder_gptq, generate_load, generous_response_snapshot
):
    responses = await generate_load(
        flash_starcoder_gptq,
        "def geometric_mean(L: List[float]):",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    # XXX: TODO: Fix this test.
    # assert all([r.generated_text == responses[0].generated_text for r in responses])

    # assert responses == generous_response_snapshot


================================================
FILE: integration-tests/models/test_grammar_llama.py
================================================
import pytest
import json

from text_generation.types import GrammarType


@pytest.fixture(scope="module")
def non_flash_llama_grammar_handle(launcher):
    with launcher(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        num_shard=1,
        disable_grammar_support=False,
        use_flash_attention=False,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def non_flash_llama_grammar(non_flash_llama_grammar_handle):
    await non_flash_llama_grammar_handle.health(300)
    return non_flash_llama_grammar_handle.client


@pytest.mark.release
@pytest.mark.skip
@pytest.mark.asyncio
async def test_non_flash_llama_grammar_json(non_flash_llama_grammar, response_snapshot):
    response = await non_flash_llama_grammar.generate(
        "info: david holtz like trees and has two cats. ",
        max_new_tokens=100,
        decoder_input_details=True,
        seed=0,
        grammar={
            "type": GrammarType.Json,
            "value": json.dumps(
                {
                    "type": "object",
                    "$id": "https://example.com/person.schema.json",
                    "$schema": "https://json-schema.org/draft/2020-12/schema",
                    "title": "Person",
                    "properties": {
                        "firstName": {
                            "type": "string",
                            "description": "The person'''s first name.",
                        },
                        "lastName": {
                            "type": "string",
                            "description": "The person'''s last name.",
                        },
                        "hobby": {
                            "description": "The person'''s hobby.",
                            "type": "string",
                        },
                        "numCats": {
                            "description": "The number of cats the person has.",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["firstName", "lastName", "hobby", "numCats"],
                }
            ),
        },
    )

    assert response.details.generated_tokens == 30
    assert (
        response.generated_text
        == '{"firstName":"David","hobby":"Trees","lastName":"Holtz","numCats":2}'
    )
    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_grammar_response_format_llama.py
================================================
import pytest
import requests
from pydantic import BaseModel
from typing import List


@pytest.fixture(scope="module")
def llama_grammar_handle(launcher):
    with launcher(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        num_shard=1,
        disable_grammar_support=False,
        use_flash_attention=False,
        max_batch_prefill_tokens=3000,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def llama_grammar(llama_grammar_handle):
    await llama_grammar_handle.health(300)
    return llama_grammar_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_grammar_response_format_llama_json(llama_grammar, response_snapshot):
    class Weather(BaseModel):
        unit: str
        temperature: List[int]

    json_payload = {
        "model": "tgi",
        "messages": [
            {
                "role": "system",
                "content": f"Respond to the users questions and answer them in the following format: {Weather.model_json_schema()}",
            },
            {
                "role": "user",
                "content": "What's the weather like the next 3 days in San Francisco, CA?",
            },
        ],
        "seed": 42,
        "max_tokens": 500,
        "response_format": {
            "type": "json_object",
            "value": Weather.model_json_schema(),
        },
    }
    # send the request
    response = requests.post(
        f"{llama_grammar.base_url}/v1/chat/completions",
        headers=llama_grammar.headers,
        json=json_payload,
    )

    chat_completion = response.json()
    called = chat_completion["choices"][0]["message"]["content"]

    assert response.status_code == 200
    assert called == '{ "unit": "fahrenheit", "temperature": [ 72, 79, 88 ] }'
    assert chat_completion == response_snapshot

    json_payload["response_format"]["type"] = "json"
    response = requests.post(
        f"{llama_grammar.base_url}/v1/chat/completions",
        headers=llama_grammar.headers,
        json=json_payload,
    )

    chat_completion = response.json()
    called = chat_completion["choices"][0]["message"]["content"]

    assert response.status_code == 200
    assert called == '{ "unit": "fahrenheit", "temperature": [ 72, 79, 88 ] }'
    assert chat_completion == response_snapshot

    json_payload["response_format"] = {
        "type": "json_schema",
        "value": {
            "name": "weather",
            "strict": True,
            "schema": Weather.model_json_schema(),
        },
    }
    response = requests.post(
        f"{llama_grammar.base_url}/v1/chat/completions",
        headers=llama_grammar.headers,
        json=json_payload,
    )

    chat_completion = response.json()
    called = chat_completion["choices"][0]["message"]["content"]

    assert response.status_code == 200
    assert called == '{ "unit": "fahrenheit", "temperature": [ 72, 79, 88 ] }'
    assert chat_completion == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_grammar_response_format_llama_error_if_tools_not_installed(
    llama_grammar,
):
    class Weather(BaseModel):
        unit: str
        temperature: List[int]

    # send the request
    response = requests.post(
        f"{llama_grammar.base_url}/v1/chat/completions",
        headers=llama_grammar.headers,
        json={
            "model": "tgi",
            "messages": [
                {
                    "role": "system",
                    "content": f"Respond to the users questions and answer them in the following format: {Weather.model_json_schema()}",
                },
                {
                    "role": "user",
                    "content": "What's the weather like the next 3 days in San Francisco, CA?",
                },
            ],
            "seed": 42,
            "max_tokens": 500,
            "tools": [],
            "response_format": {
                "type": "json_object",
                "value": Weather.model_json_schema(),
            },
        },
    )

    # 422 means the server was unable to process the request because it contains invalid data.
    assert response.status_code == 422
    assert response.json() == {
        "error": "Tool error: Grammar and tools are mutually exclusive",
        "error_type": "tool_error",
    }


================================================
FILE: integration-tests/models/test_idefics.py
================================================
import pytest


@pytest.fixture(scope="module")
def idefics_handle(launcher):
    with launcher(
        "HuggingFaceM4/idefics-9b-instruct", num_shard=2, dtype="float16"
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def idefics(idefics_handle):
    await idefics_handle.health(300)
    return idefics_handle.client


@pytest.mark.asyncio
async def test_idefics(idefics, response_snapshot, chicken):
    response = await idefics.generate(
        f"User:![]({chicken})Can you tell me a very short story based on the image?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text == " \nAssistant: A rooster stands"
    ), f"{repr(response.generated_text)}"
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_idefics_two_images(idefics, response_snapshot, chicken, cow_beach):
    response = await idefics.generate(
        f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken?<end_of_utterance> \nAssistant:",
        max_new_tokens=20,
    )
    assert (
        response.generated_text == " The cow and chicken are standing on a beach."
    ), f"{repr(response.generated_text)}"
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_idefics_load(idefics, generate_load, response_snapshot, chicken):
    responses = await generate_load(
        idefics,
        f"User:![]({chicken})Can you tell me a very short story based on the image?",
        max_new_tokens=10,
        n=4,
    )

    generated_texts = [r.generated_text for r in responses]

    assert generated_texts[0] == " \nAssistant: A rooster stands"
    assert len(generated_texts) == 4
    assert generated_texts, all(
        [text == generated_texts[0] for text in generated_texts]
    )

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_idefics2.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_idefics2_next_handle(launcher):
    with launcher(
        "HuggingFaceM4/idefics2-8b",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_idefics2_next(flash_idefics2_next_handle):
    await flash_idefics2_next_handle.health(300)
    return flash_idefics2_next_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_idefics2_next_simple(
    flash_idefics2_next, response_snapshot, chicken
):
    response = await flash_idefics2_next.generate(
        f"User:![]({chicken})Write me a short story<end_of_utterance> \nAssistant:",
        max_new_tokens=10,
    )
    assert (
        response.generated_text == " A chicken is sitting on a pile of money."
    ), f"{repr(response.generated_text)}"
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_idefics2_two_images(
    flash_idefics2_next, response_snapshot, chicken, cow_beach
):
    response = await flash_idefics2_next.generate(
        f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken?<end_of_utterance> \nAssistant:",
        max_new_tokens=20,
    )
    assert (
        response.generated_text
        == " The cow is standing on the beach and the chicken is sitting on a pile of money."
    ), f"{repr(response.generated_text)}"
    assert response.details.generated_tokens == 19
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_idefics2_next_all_params(flash_idefics2_next, response_snapshot):
    response = await flash_idefics2_next.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_idefics2_next_load(
    flash_idefics2_next, generate_load, response_snapshot, chicken
):
    responses = await generate_load(
        flash_idefics2_next,
        f"User:![]({chicken})Write me a short story<end_of_utterance> \nAssistant:",
        max_new_tokens=10,
        n=4,
    )
    generated_texts = [r.generated_text for r in responses]
    assert generated_texts[0] == " A chicken is sitting on a pile of money."
    assert len(generated_texts) == 4
    assert all([r.generated_text == generated_texts[0] for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_idefics3.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_idefics3_next_handle(launcher):
    with launcher("HuggingFaceM4/Idefics3-8B-Llama3") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_idefics3_next(flash_idefics3_next_handle):
    await flash_idefics3_next_handle.health(300)
    return flash_idefics3_next_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_idefics3_next_simple_url(flash_idefics3_next, response_snapshot):
    ny_skyline = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
    query = "What is in this image?"
    response = await flash_idefics3_next.generate(
        f"<|begin_of_text|><|begin_of_text|>User:![]({ny_skyline}){query}<end_of_utterance>\nAssistant:",
        max_new_tokens=10,
        seed=1337,
    )
    print(response)
    assert (
        response.generated_text == " There is a statue in the image."
    ), f"{repr(response.generated_text)}"
    assert response.details.generated_tokens == 9
    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_json_schema_constrain.py
================================================
import pytest
import json
import requests


@pytest.fixture(scope="module")
def model_handle(launcher):
    """Fixture to provide the base URL for API calls."""
    with launcher(
        "google/gemma-3-4b-it",
        num_shard=2,
        disable_grammar_support=False,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def model_fixture(model_handle):
    await model_handle.health(300)
    return model_handle.client


# Sample JSON Schema for testing
person_schema = {
    "type": "object",
    "$id": "https://example.com/person.schema.json",
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "Person",
    "properties": {
        "firstName": {
            "type": "string",
            "description": "The person's first name.",
            "minLength": 4,
        },
        "lastName": {
            "type": "string",
            "description": "The person's last name.",
            "minLength": 4,
        },
        "hobby": {
            "description": "The person's hobby.",
            "type": "string",
            "minLength": 4,
        },
        "numCats": {
            "description": "The number of cats the person has.",
            "type": "integer",
            "minimum": 0,
        },
    },
    "required": ["firstName", "lastName", "hobby", "numCats"],
}

# More complex schema for testing nested objects and arrays
complex_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer", "minimum": 0},
        "address": {
            "type": "object",
            "properties": {
                "street": {"type": "string"},
                "city": {"type": "string"},
                "postalCode": {"type": "string"},
            },
            "required": ["street", "city"],
        },
        "hobbies": {"type": "array", "items": {"type": "string"}, "minItems": 1},
    },
    "required": ["name", "age", "hobbies"],
}


@pytest.mark.asyncio
@pytest.mark.private
async def test_json_schema_basic(model_fixture, response_snapshot):
    """Test basic JSON schema validation with the person schema."""
    response = requests.post(
        f"{model_fixture.base_url}/v1/chat/completions",
        json={
            "model": "tgi",
            "messages": [
                {
                    "role": "user",
                    "content": "David is a person who likes trees and nature. He enjoys studying math and science. He has 2 cats.",
                },
            ],
            "seed": 42,
            "temperature": 0.0,
            "response_format": {
                "type": "json_schema",
                "value": {"name": "person", "strict": True, "schema": person_schema},
            },
        },
    )

    result = response.json()

    # Validate response format
    content = result["choices"][0]["message"]["content"]
    parsed_content = json.loads(content)

    assert "firstName" in parsed_content
    assert "lastName" in parsed_content
    assert "hobby" in parsed_content
    assert "numCats" in parsed_content
    assert isinstance(parsed_content["numCats"], int)
    assert parsed_content["numCats"] >= 0
    assert result == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_json_schema_complex(model_fixture, response_snapshot):
    """Test complex JSON schema with nested objects and arrays."""
    response = requests.post(
        f"{model_fixture.base_url}/v1/chat/completions",
        json={
            "model": "tgi",
            "messages": [
                {
                    "role": "user",
                    "content": "John Smith is 30 years old. He lives on Maple Street in Boston. He enjoys botany, astronomy, and solving mathematical puzzles.",
                },
            ],
            "seed": 42,
            "temperature": 0.0,
            "response_format": {
                "type": "json_schema",
                "value": {
                    "name": "complex_person",
                    "strict": True,
                    "schema": complex_schema,
                },
            },
        },
    )

    result = response.json()

    # Validate response format
    content = result["choices"][0]["message"]["content"]
    parsed_content = json.loads(content)

    assert "name" in parsed_content
    assert "age" in parsed_content
    assert "hobbies" in parsed_content
    assert "address" in parsed_content
    assert "street" in parsed_content["address"]
    assert "city" in parsed_content["address"]
    assert isinstance(parsed_content["hobbies"], list)
    assert len(parsed_content["hobbies"]) >= 1
    assert result == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_json_schema_stream(model_fixture, response_snapshot):
    """Test JSON schema validation with streaming."""
    response = requests.post(
        f"{model_fixture.base_url}/v1/chat/completions",
        json={
            "model": "tgi",
            "messages": [
                {
                    "role": "user",
                    "content": "David is a person who likes to ride bicycles. He has 2 cats.",
                },
            ],
            "seed": 42,
            "temperature": 0.0,
            "response_format": {
                "type": "json_schema",
                "value": {"name": "person", "strict": True, "schema": person_schema},
            },
            "stream": True,
        },
        stream=True,
    )

    chunks = []
    content_generated = ""

    for line in response.iter_lines():
        if line:
            # Remove the "data: " prefix and handle the special case of "[DONE]"
            data = line.decode("utf-8")
            if data.startswith("data: "):
                data = data[6:]
                if data != "[DONE]":
                    chunk = json.loads(data)
                    chunks.append(chunk)
                    if "choices" in chunk and len(chunk["choices"]) > 0:
                        if (
                            "delta" in chunk["choices"][0]
                            and "content" in chunk["choices"][0]["delta"]
                        ):
                            content_generated += chunk["choices"][0]["delta"]["content"]

    # Validate the final assembled JSON
    parsed_content = json.loads(content_generated)
    assert "firstName" in parsed_content
    assert "lastName" in parsed_content
    assert "hobby" in parsed_content
    assert "numCats" in parsed_content
    assert isinstance(parsed_content["numCats"], int)
    assert parsed_content["numCats"] >= 0
    assert chunks == response_snapshot


================================================
FILE: integration-tests/models/test_llava_next.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llava_next_handle(launcher):
    with launcher(
        "llava-hf/llava-v1.6-mistral-7b-hf",
        num_shard=4,
        max_input_length=4000,
        max_total_tokens=4096,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llava_next(flash_llava_next_handle):
    await flash_llava_next_handle.health(300)
    return flash_llava_next_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llava_next_simple(flash_llava_next, response_snapshot, chicken):
    response = await flash_llava_next.generate(
        f"User:![]({chicken})Can you tell me a very short story based on the image?",
        max_new_tokens=10,
    )
    assert (
        response.generated_text == "\n\nOnce upon a time, there was a"
    ), f"{repr(response.generated_text)}"
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llava_next_all_params(flash_llava_next, response_snapshot):
    response = await flash_llava_next.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 6
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llava_next_load(
    flash_llava_next, generate_load, response_snapshot, chicken
):
    responses = await generate_load(
        flash_llava_next,
        f"User:![]({chicken})Can you tell me a very short story based on the image?",
        max_new_tokens=10,
        n=4,
    )
    generated_texts = [r.generated_text for r in responses]
    assert generated_texts[0] == "\n\nOnce upon a time, there was a"
    assert len(generated_texts) == 4
    assert all([r.generated_text == generated_texts[0] for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_lora_mistral.py
================================================
import pytest
import requests


@pytest.fixture(scope="module")
def lora_mistral_handle(launcher):
    with launcher(
        "mistralai/Mistral-7B-v0.1",
        lora_adapters=[
            "predibase/dbpedia",
            "predibase/customer_support",
        ],
        cuda_graphs=[0],
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def lora_mistral(lora_mistral_handle):
    await lora_mistral_handle.health(300)
    return lora_mistral_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_lora_mistral(lora_mistral, response_snapshot):
    response = await lora_mistral.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )
    assert response.details.generated_tokens == 10


classification_prompt = """You are given the title and the body of an article below. Please determine the type of the article.\n### Title: Great White Whale\n\n### Body: Great White Whale is the debut album by the Canadian rock band Secret and Whisper. The album was in the works for about a year and was released on February 12 2008. A music video was shot in Pittsburgh for the album's first single XOXOXO. The album reached number 17 on iTunes's top 100 albums in its first week on sale.\n\n### Article Type:"""


@pytest.mark.asyncio
@pytest.mark.private
async def test_lora_mistral_without_adapter(lora_mistral, response_snapshot):
    response = requests.post(
        f"{lora_mistral.base_url}/generate",
        headers=lora_mistral.headers,
        json={
            "inputs": classification_prompt,
            "parameters": {
                "max_new_tokens": 40,
                "details": True,
            },
        },
    )

    assert response.status_code == 200
    data = response.json()
    assert (
        data["generated_text"]
        == "\n\n### 1. News\n### 2. Blog\n### 3. Article\n### 4. Review\n### 5. Other\n\n\n\n\n\n\n\n\n"
    )
    assert data == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_lora_mistral_with_dbpedia_adapter(lora_mistral, response_snapshot):
    response = requests.post(
        f"{lora_mistral.base_url}/generate",
        headers=lora_mistral.headers,
        json={
            "inputs": classification_prompt,
            "parameters": {
                "max_new_tokens": 40,
                "adapter_id": "predibase/dbpedia",
                "details": True,
            },
        },
    )

    assert response.status_code == 200
    data = response.json()
    assert data["generated_text"] == "  11"
    assert data == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_lora_mistral_with_customer_support_adapter(
    lora_mistral, response_snapshot
):
    print(lora_mistral.base_url)
    print(lora_mistral.headers)
    response = requests.post(
        f"{lora_mistral.base_url}/generate",
        headers=lora_mistral.headers,
        json={
            "inputs": "What are 3 unique words that describe you?",
            "parameters": {
                "max_new_tokens": 40,
                "adapter_id": "predibase/customer_support",
                "details": True,
            },
        },
    )

    assert response.status_code == 200
    data = response.json()
    assert (
        data["generated_text"]
        == "\n\nI’m not sure if I can come up with 3 unique words that describe me, but I’ll try.\n\n1. Creative\n2. Funny\n3."
    )
    assert data == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_lora_mistral_without_customer_support_adapter(
    lora_mistral, response_snapshot
):
    response = requests.post(
        f"{lora_mistral.base_url}/generate",
        headers=lora_mistral.headers,
        json={
            "inputs": "What are 3 unique words that describe you?",
            "parameters": {
                "max_new_tokens": 40,
                "details": True,
            },
        },
    )

    assert response.status_code == 200
    data = response.json()
    assert (
        data["generated_text"]
        == "\n\nI’m a very passionate person. I’m very driven. I’m very determined.\n\nWhat is your favorite thing about being a teacher?\n\nI love the fact"
    )
    assert data == response_snapshot


================================================
FILE: integration-tests/models/test_mamba.py
================================================
import pytest


@pytest.fixture(scope="module")
def fused_kernel_mamba_handle(launcher):
    with launcher("state-spaces/mamba-130m-hf", num_shard=1) as handle:
        yield handle


@pytest.fixture(scope="module")
async def fused_kernel_mamba(fused_kernel_mamba_handle):
    await fused_kernel_mamba_handle.health(300)
    return fused_kernel_mamba_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_mamba(fused_kernel_mamba, response_snapshot):
    response = await fused_kernel_mamba.generate(
        "What is Deep Learning?", max_new_tokens=10
    )

    assert response.details.generated_tokens == 10
    assert response.generated_text == "\n\nDeep learning is a new type of machine"
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_mamba_all_params(fused_kernel_mamba, response_snapshot):
    response = await fused_kernel_mamba.generate(
        "blue, red, yellow, ",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "blue, red, yellow, \nand blue colors. A number of the color"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_mamba_load(
    fused_kernel_mamba, generate_load, generous_response_snapshot
):
    responses = await generate_load(
        fused_kernel_mamba, "What is Deep Learning?", max_new_tokens=10, n=4
    )

    assert len(responses) == 4
    assert responses[0].generated_text == "\n\nDeep learning is a new type of machine"
    assert all([r.generated_text == responses[0].generated_text for r in responses])
    assert responses[0].generated_text == "\n\nDeep learning is a new type of machine"

    assert responses == generous_response_snapshot


================================================
FILE: integration-tests/models/test_mllama.py
================================================
import pytest
import asyncio


@pytest.fixture(scope="module")
def mllama_handle(launcher):
    with launcher(
        "unsloth/Llama-3.2-11B-Vision-Instruct",
        num_shard=2,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def mllama(mllama_handle):
    await mllama_handle.health(300)
    return mllama_handle.client


@pytest.mark.asyncio
async def test_mllama_simpl(mllama, response_snapshot):
    response = await mllama.chat(
        max_tokens=10,
        temperature=0.0,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe the image in 10 words.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
                        },
                    },
                ],
            },
        ],
    )

    assert response.usage == {
        "completion_tokens": 10,
        "prompt_tokens": 45,
        "total_tokens": 55,
    }
    assert (
        response.choices[0].message.content
        == "A chicken sits on a pile of money, looking"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_mllama_load(mllama, generate_load, response_snapshot):
    futures = [
        mllama.chat(
            max_tokens=10,
            temperature=0.0,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe the image in 10 words.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
                            },
                        },
                    ],
                },
            ],
        )
        # TODO with v3, 4 breaks here. Nothing accounts of the image VRAM
        # because mllama is the only one doing its thing.
        for i in range(2)
    ]
    responses = await asyncio.gather(*futures)

    generated_texts = [response.choices[0].message.content for response in responses]

    # XXX: TODO: Fix this test.
    assert generated_texts[0] == "A chicken sits on a pile of money, looking"
    assert len(generated_texts) == 2
    assert generated_texts, all(
        [text == generated_texts[0] for text in generated_texts]
    )
    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_mpt.py
================================================
import pytest


@pytest.fixture(scope="module")
def mpt_sharded_handle(launcher):
    with launcher("mosaicml/mpt-7b", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def mpt_sharded(mpt_sharded_handle):
    await mpt_sharded_handle.health(300)
    return mpt_sharded_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_mpt(mpt_sharded, response_snapshot):
    response = await mpt_sharded.generate(
        "What is Deep Learning?",
        max_new_tokens=17,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 17
    assert (
        response.generated_text
        == " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_mpt_load(mpt_sharded, generate_load, response_snapshot):
    responses = await generate_load(
        mpt_sharded,
        "What is Deep Learning?",
        max_new_tokens=17,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])
    assert (
        responses[0].generated_text
        == " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
    )

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_mt0_base.py
================================================
import pytest


@pytest.fixture(scope="module")
def mt0_base_handle(launcher):
    with launcher("bigscience/mt0-base") as handle:
        yield handle


@pytest.fixture(scope="module")
async def mt0_base(mt0_base_handle):
    await mt0_base_handle.health(300)
    return mt0_base_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_mt0_base(mt0_base, response_snapshot):
    response = await mt0_base.generate(
        "Why is the sky blue?",
        max_new_tokens=10,
        top_p=0.9,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 5
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_mt0_base_all_params(mt0_base, response_snapshot):
    response = await mt0_base.generate(
        "Why is the sky blue?",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_mt0_base_load(mt0_base, generate_load, response_snapshot):
    responses = await generate_load(
        mt0_base,
        "Why is the sky blue?",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_neox.py
================================================
import pytest


@pytest.fixture(scope="module")
def neox_handle(launcher):
    with launcher(
        "stabilityai/stablelm-tuned-alpha-3b", num_shard=1, use_flash_attention=False
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def neox(neox_handle):
    await neox_handle.health(300)
    return neox_handle.client


@pytest.mark.release
@pytest.mark.skip
@pytest.mark.asyncio
async def test_neox(neox, response_snapshot):
    response = await neox.generate(
        "<|USER|>What's your mood today?<|ASSISTANT|>",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.skip
@pytest.mark.asyncio
async def test_neox_load(neox, generate_load, response_snapshot):
    responses = await generate_load(
        neox,
        "<|USER|>What's your mood today?<|ASSISTANT|>",
        max_new_tokens=10,
        n=4,
    )

    generated_texts = [r.generated_text for r in responses]

    assert len(generated_texts) == 4
    assert generated_texts, all(
        [text == generated_texts[0] for text in generated_texts]
    )

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_neox_sharded.py
================================================
import pytest


@pytest.fixture(scope="module")
def neox_sharded_handle(launcher):
    with launcher(
        "OpenAssistant/oasst-sft-1-pythia-12b", num_shard=2, use_flash_attention=False
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def neox_sharded(neox_sharded_handle):
    await neox_sharded_handle.health(300)
    return neox_sharded_handle.client


@pytest.mark.release
@pytest.mark.skip
@pytest.mark.asyncio
async def test_neox(neox_sharded, response_snapshot):
    response = await neox_sharded.generate(
        "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.skip
@pytest.mark.asyncio
async def test_neox_load(neox_sharded, generate_load, response_snapshot):
    responses = await generate_load(
        neox_sharded,
        "<|prompter|>What is a meme, and what's the history behind this word?<|endoftext|><|assistant|>",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_opt.py
================================================
import pytest


@pytest.fixture(scope="module")
def opt_sharded_handle(launcher):
    with launcher("facebook/opt-6.7b", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def opt_sharded(opt_sharded_handle):
    await opt_sharded_handle.health(300)
    return opt_sharded_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_opt(opt_sharded):
    pass


================================================
FILE: integration-tests/models/test_smolvlm.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_smolvlm_next_handle(launcher):
    with launcher("HuggingFaceTB/SmolVLM-Instruct") as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_smolvlm_next(flash_smolvlm_next_handle):
    await flash_smolvlm_next_handle.health(300)
    return flash_smolvlm_next_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_smolvlm_next_simple_url(flash_smolvlm_next, response_snapshot):
    ny_skyline = "https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg"
    query = "What is in this image?"
    response = await flash_smolvlm_next.generate(
        f"<|begin_of_text|><|begin_of_text|>User:![]({ny_skyline}){query}<end_of_utterance>\nAssistant:",
        max_new_tokens=10,
        seed=1337,
    )
    print(response)
    assert (
        response.generated_text == " A bee on a pink flower."
    ), f"{repr(response.generated_text)}"
    assert response.details.generated_tokens == 8
    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_t5_sharded.py
================================================
import pytest


@pytest.fixture(scope="module")
def t5_sharded_handle(launcher):
    with launcher("google/flan-t5-xxl", num_shard=4) as handle:
        yield handle


@pytest.fixture(scope="module")
async def t5_sharded(t5_sharded_handle):
    await t5_sharded_handle.health(300)
    return t5_sharded_handle.client


@pytest.mark.release
@pytest.mark.asyncio
async def test_t5_sharded(t5_sharded, response_snapshot):
    response = await t5_sharded.generate(
        "Please answer the following question. What is the boiling point of Nitrogen?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_t5_sharded_load(t5_sharded, generate_load, response_snapshot):
    responses = await generate_load(
        t5_sharded,
        "Please answer the following question. What is the boiling point of Nitrogen?",
        max_new_tokens=10,
        n=4,
    )

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/models/test_tools_llama.py
================================================
import pytest
from openai import OpenAI
from huggingface_hub import InferenceClient
from huggingface_hub.inference._generated.types.chat_completion import (
    ChatCompletionOutputToolCall,
    ChatCompletionOutputFunctionDefinition,
)


@pytest.fixture(scope="module")
def flash_llama_grammar_tools_handle(launcher):
    with launcher(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        num_shard=2,
        disable_grammar_support=False,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_grammar_tools(flash_llama_grammar_tools_handle):
    await flash_llama_grammar_tools_handle.health(300)
    return flash_llama_grammar_tools_handle.client


# tools to be used in the following tests
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                },
                "required": ["location", "format"],
                "additionalProperties": False,
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_n_day_weather_forecast",
            "description": "Get an N-day weather forecast",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                    "num_days": {
                        "type": "integer",
                        "description": "The number of days to forecast",
                    },
                },
                "required": ["location", "format", "num_days"],
                "additionalProperties": False,
            },
        },
    },
]


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_nostream(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    response = client.chat_completion(
        max_tokens=100,
        seed=1,
        tools=tools,
        temperature=0.0,
        messages=[
            {
                "role": "system",
                "content": "Youre a helpful assistant! Answer the users question best you can.",
            },
            {
                "role": "user",
                "content": "What is the weather like in Brooklyn, New York?",
            },
        ],
    )
    assert response.choices[0].message.content is None
    assert response.choices[0].message.tool_calls == [
        ChatCompletionOutputToolCall(
            id="0",
            type="function",
            function=ChatCompletionOutputFunctionDefinition(
                description=None,
                name="get_current_weather",
                arguments='{"location":"Brooklyn, NY","format":"fahrenheit"}',
            ),
        )
    ]
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_openai(
    flash_llama_grammar_tools, response_snapshot
):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    stream = client.chat.completions.create(
        model="tgi",
        max_tokens=100,
        seed=1,
        tools=tools,
        stream=True,
        temperature=0.0,
        messages=[
            {
                "role": "system",
                "content": "Youre a helpful assistant! Answer the users question best you can.",
            },
            {
                "role": "user",
                "content": "What is the weather like in Brooklyn, New York?",
            },
        ],
    )

    chunks = []
    tool = ""
    name = ""
    for chunk in stream:
        if chunk.choices[0].delta.tool_calls[0].function.name:
            name += chunk.choices[0].delta.tool_calls[0].function.name
        tool += chunk.choices[0].delta.tool_calls[0].function.arguments
        chunks.append(chunk)

    assert name == "get_current_weather"
    assert tool == '{ "location": "Brooklyn, NY", "format": "fahrenheit"}'
    assert chunks == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_auto_nostream(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    response = client.chat_completion(
        max_tokens=100,
        seed=1,
        tools=tools,
        temperature=0.0,
        tool_choice="auto",
        messages=[
            {
                "role": "system",
                "content": "Youre a helpful assistant! Answer the users question best you can.",
            },
            {
                "role": "user",
                "content": "What is the weather like in Brooklyn, New York?",
            },
        ],
    )
    assert response.choices[0].message.content is None
    assert response.choices[0].message.tool_calls == [
        ChatCompletionOutputToolCall(
            id="0",
            type="function",
            function=ChatCompletionOutputFunctionDefinition(
                description=None,
                name="get_current_weather",
                arguments='{"location":"Brooklyn, NY","format":"fahrenheit"}',
            ),
        )
    ]

    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_choice_nostream(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    response = client.chat_completion(
        max_tokens=100,
        seed=1,
        tools=tools,
        temperature=0.0,
        tool_choice="get_current_weather",
        messages=[
            {
                "role": "system",
                "content": "Youre a helpful assistant! Answer the users question best you can.",
            },
            {
                "role": "user",
                "content": "What is the weather like in Brooklyn, New York?",
            },
        ],
    )
    assert response.choices[0].message.content is None
    assert response.choices[0].message.tool_calls == [
        ChatCompletionOutputToolCall(
            id="0",
            type="function",
            function=ChatCompletionOutputFunctionDefinition(
                description=None,
                name="get_current_weather",
                arguments='{"location":"Brooklyn, NY","format":"fahrenheit"}',
            ),
        )
    ]

    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_choice_stream(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    stream = client.chat_completion(
        max_tokens=100,
        seed=1,
        tools=tools,
        temperature=0.0,
        tool_choice="get_current_weather",
        messages=[
            {
                "role": "system",
                "content": "Youre a helpful assistant! Answer the users question best you can.",
            },
            {
                "role": "user",
                "content": "What is the weather like in Brooklyn, New York?",
            },
        ],
        stream=True,
    )

    arguments = ""
    chunks = []
    name = ""
    for chunk in stream:
        if chunk.choices[0].delta.tool_calls[0].function.name:
            name += chunk.choices[0].delta.tool_calls[0].function.name
        arguments += chunk.choices[0].delta.tool_calls[0].function.arguments
        assert chunk.choices[0].delta.content is None
        chunks.append(chunk)

    assert name == "get_current_weather"
    assert arguments == '{ "location": "Brooklyn, NY", "format": "fahrenheit"}'
    assert chunks == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_insufficient_information_nostream(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    response = client.chat_completion(
        max_tokens=20,
        seed=24,
        tools=tools,
        tool_choice="auto",
        messages=[
            {
                "role": "system",
                "content": "You're a helpful assistant! Answer the users question best you can.",
            },
            {
                "role": "user",
                "content": "Who are you?",
            },
        ],
        stream=False,
    )

    content_generated = response.choices[0].message.content
    assert response.choices[0].message.tool_calls is None

    assert (
        content_generated
        == "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI"
    )
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_insufficient_information_stream(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    stream = client.chat_completion(
        max_tokens=20,
        seed=24,
        tools=tools,
        tool_choice="auto",
        messages=[
            {
                "role": "system",
                "content": "You're a helpful assistant! Answer the users question best you can.",
            },
            {
                "role": "user",
                "content": "Who are you?",
            },
        ],
        stream=True,
    )

    content_generated = ""
    chunks = []
    for chunk in stream:
        content_generated += chunk.choices[0].delta.content
        chunks.append(chunk)
        assert chunk.choices[0].delta.tool_calls is None

    ######## This is exactly the same as the non streaming case
    assert (
        content_generated
        == "I'm an artificial intelligence model known as a large language model (LLM) or conversational AI"
    )
    assert chunks == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_sea_creatures_stream_auto(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    stream = client.chat_completion(
        max_tokens=20,
        seed=24,
        tools=tools,
        tool_choice="auto",
        messages=[
            {
                "role": "system",
                "content": "You're a helpful assistant! Answer the users question best you can. If the question is not answerable by the tools, just generate a response.",
            },
            {
                "role": "user",
                "content": "Tell me a story about 3 sea creatures",
            },
        ],
        stream=True,
    )

    content_generated = ""
    chunks = []
    for chunk in stream:
        content_generated += chunk.choices[0].delta.content
        chunks.append(chunk)
        assert chunk.choices[0].delta.tool_calls is None

    assert (
        content_generated
        == "Once upon a time, in a vibrant ocean filled with coral reefs and schools of shimmering fish,"
    )
    assert chunks == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_sea_creatures_stream_required(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    stream = client.chat_completion(
        max_tokens=100,
        seed=24,
        tools=tools,
        tool_choice="required",
        messages=[
            {
                "role": "system",
                "content": "You're a helpful assistant! Answer the users question best you can. If the question is not answerable by the tools, just generate a response.",
            },
            {
                "role": "user",
                "content": "Tell me a story about 3 sea creatures",
            },
        ],
        stream=True,
    )

    tool_calls_generated = ""
    name = ""
    chunks = []
    for chunk in stream:
        assert chunk.choices[0].delta.content is None
        if chunk.choices[0].delta.tool_calls[0].function.name:
            name += chunk.choices[0].delta.tool_calls[0].function.name
        tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments

    assert name == "get_n_day_weather_forecast"
    assert (
        tool_calls_generated
        == '{ "location": "San Francisco, CA", "format": "fahrenheit", "num_days":3}'
    )
    assert chunks == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_sea_creatures_stream_none(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    stream = client.chat_completion(
        max_tokens=100,
        seed=24,
        tools=tools,
        tool_choice="none",
        messages=[
            {
                "role": "system",
                "content": "You're a helpful assistant! Answer the users question best you can. If the question is not answerable by the tools, just generate a response.",
            },
            {
                "role": "user",
                "content": "Tell me a story about 3 sea creatures",
            },
        ],
        stream=True,
    )

    content_generated = ""
    chunks = []
    for chunk in stream:
        chunks.append(chunk)
        content_generated += chunk.choices[0].delta.content
        assert chunk.choices[0].delta.tool_calls is None

    assert (
        content_generated
        == "Once upon a time, in a vibrant ocean filled with coral reefs and schools of shimmering fish, lived three dear friends: Luna the sea turtle, Finley the friendly fish, and Crusty the wise crab.\n\nLuna was the oldest of the three. She had traveled the world, exploring hidden caves and shipwrecks, and collecting sparkling shells and shiny pebbles. Her shell was a beautiful mosaic of blues and greens, and her gentle eyes twinkled with the secrets of the deep"
    )
    assert chunks == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_grammar_tools_sea_creatures_stream_function_object(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    stream = client.chat_completion(
        messages=[
            {
                "role": "system",
                "content": "You're a helpful assistant! Answer the users question best you can. If the question is not answerable by the tools, just generate a response.",
            },
            {
                "role": "user",
                "content": "Tell me a story about 3 sea creatures",
            },
        ],
        tools=tools,
        tool_choice={
            "type": "function",
            "function": {"name": "get_n_day_weather_forecast"},
        },
        max_tokens=100,
        seed=24,
        stream=True,
    )
    chunks = []
    tool_calls_generated = ""
    name = ""
    for chunk in stream:
        assert chunk.choices[0].delta.content is None
        if chunk.choices[0].delta.tool_calls[0].function.name:
            name += chunk.choices[0].delta.tool_calls[0].function.name
        tool_calls_generated += chunk.choices[0].delta.tool_calls[0].function.arguments

    assert name == "get_n_day_weather_forecast"
    assert (
        tool_calls_generated
        == '{ "location": "San Francisco, CA", "format": "celsius", "num_days": 3}'
    )
    assert chunks == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_tool_reply_response(
    flash_llama_grammar_tools, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_grammar_tools.base_url}/v1")
    response = client.chat_completion(
        max_tokens=100,
        seed=42,
        messages=[
            {"role": "user", "content": "What's the weather like in Paris today?"},
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "0",
                        "function": {
                            "arguments": '{"longitude": 2.2945, "latitude": 48.8567}',
                            "name": "get_weather",
                            "description": None,
                        },
                        "type": "function",
                    }
                ],
            },
            {"role": "tool", "tool_call_id": "0", "content": "6.7"},
        ],
        stream=False,
    )

    assert response.choices[0].message.tool_calls is None
    assert (
        response.choices[0].message.content
        == "I can't access real-time data, but I can provide you with current conditions and forecast for Paris, France:\n\nThe current conditions in Paris are mostly cloudy with a temperature of 6.7°C (44.1°F). \n\nPlease note that the actual weather may differ from the provided information. For up-to-date information, I suggest checking a reliable weather website or app for the latest conditions and forecast."
    )

    assert response == response_snapshot


================================================
FILE: integration-tests/models/test_transformers_llama4.py
================================================
# import base64
# from io import BytesIO
# from PIL import Image
#
# import pytest
#
#
# @pytest.fixture(scope="module")
# def flash_llama4_handle(launcher):
#     with launcher("ll-re/Llama-4-Scout-17B-16E-Instruct", num_shard=8) as handle:
#         yield handle
#
#
# @pytest.fixture(scope="module")
# async def flash_llama4(flash_llama4_handle):
#     await flash_llama4_handle.health(300)
#     return flash_llama4_handle.client
#
#
# async def test_flash_llama4(flash_llama4, response_snapshot):
#     response = await flash_llama4.generate(
#         "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many",
#         seed=42,
#         max_new_tokens=100,
#     )
#
#     assert (
#         response.generated_text
#         == " people died in the 1918 flu pandemic. Estimating the death toll of the 1918 flu pandemic is difficult because of incomplete records and because of the fact that many of the extra deaths were not attributed to the flu. Many experts believe that the 1918 flu pandemic killed between 50 and 100 million people. Iassistant\n\nThe 1918 flu pandemic, also known as the Spanish flu, is indeed one of the most devastating public health crises in human history. Estimating the exact"
#     )
#     assert response.details.generated_tokens == 100
#     assert response == response_snapshot
#
#
# async def test_flash_llama4_image_cow_dog(flash_llama4, response_snapshot):
#     image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
#     response = await flash_llama4.chat(
#         seed=42,
#         messages=[
#             {
#                 "role": "user",
#                 "content": [
#                     {"type": "image_url", "image_url": {"url": image_url}},
#                     {
#                         "type": "text",
#                         "text": "What is the breed of the dog in the image?",
#                     },
#                 ],
#             },
#         ],
#         max_tokens=100,
#     )
#
#     assert (
#         response.choices[0].message.content
#         == "The image does not depict a dog; it shows a cow standing on a beach. Therefore, there is no breed of a dog to identify."
#     )
#     assert response.usage["completion_tokens"] == 30
#     assert response == response_snapshot
#
#
# async def test_flash_llama4_image_cow(flash_llama4, response_snapshot):
#     image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
#     response = await flash_llama4.chat(
#         seed=42,
#         messages=[
#             {
#                 "role": "user",
#                 "content": [
#                     {"type": "image_url", "image_url": {"url": image_url}},
#                     {"type": "text", "text": "What is shown in this image?"},
#                 ],
#             },
#         ],
#         max_tokens=100,
#     )
#     assert (
#         response.choices[0].message.content
#         == "The image shows a brown cow standing on the beach with a white face and black and white marking on its ears. The cow has a white patch around its nose and mouth. The ocean and blue sky are in the background."
#     )
#     assert response.usage["completion_tokens"] == 46
#     assert response == response_snapshot
#
#
# # Helper function to convert a Pillow image to a base64 data URL
# def image_to_data_url(img: Image.Image, fmt: str) -> str:
#     buffer = BytesIO()
#     img.save(buffer, format=fmt)
#     img_data = buffer.getvalue()
#     b64_str = base64.b64encode(img_data).decode("utf-8")
#     mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg"
#     return f"data:{mime_type};base64,{b64_str}"
#
#
# async def test_flash_llama4_image_base64_rgba(flash_llama4, response_snapshot):
#     # Create an empty 100x100 PNG image with alpha (transparent background)
#     img = Image.new("RGBA", (100, 100), (0, 0, 0, 0))
#     data_url = image_to_data_url(img, "PNG")
#     response = await flash_llama4.chat(
#         seed=42,
#         messages=[
#             {
#                 "role": "user",
#                 "content": [
#                     {"type": "image_url", "image_url": {"url": data_url}},
#                     {
#                         "type": "text",
#                         "text": "What do you see in this transparent image?",
#                     },
#                 ],
#             },
#         ],
#         max_tokens=100,
#     )
#     assert response == response_snapshot
#
#
# async def test_flash_llama4_image_base64_rgb_png(flash_llama4, response_snapshot):
#     # Create an empty 100x100 PNG image without alpha (white background)
#     img = Image.new("RGB", (100, 100), (255, 255, 255))
#     data_url = image_to_data_url(img, "PNG")
#     response = await flash_llama4.chat(
#         seed=42,
#         messages=[
#             {
#                 "role": "user",
#                 "content": [
#                     {"type": "image_url", "image_url": {"url": data_url}},
#                     {"type": "text", "text": "What do you see in this plain image?"},
#                 ],
#             },
#         ],
#         max_tokens=100,
#     )
#     assert response == response_snapshot
#
#
# async def test_flash_llama4_image_base64_rgb_jpg(flash_llama4, response_snapshot):
#     # Create an empty 100x100 JPEG image (white background)
#     img = Image.new("RGB", (100, 100), (255, 255, 255))
#     data_url = image_to_data_url(img, "JPEG")
#     response = await flash_llama4.chat(
#         seed=42,
#         messages=[
#             {
#                 "role": "user",
#                 "content": [
#                     {"type": "image_url", "image_url": {"url": data_url}},
#                     {"type": "text", "text": "What do you see in this JPEG image?"},
#                 ],
#             },
#         ],
#         max_tokens=100,
#     )
#     assert response == response_snapshot


================================================
FILE: integration-tests/models/test_transformers_olmo.py
================================================
import pytest


@pytest.fixture(scope="module")
def flash_llama_handle(launcher):
    with launcher("allenai/OLMo-7B-0724-Instruct-hf", num_shard=2) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama(flash_llama_handle):
    await flash_llama_handle.health(300)
    return flash_llama_handle.client


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_simple(flash_llama, response_snapshot):
    response = await flash_llama.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response.generated_text == ':\n\n```json\n{\n  "'
    assert response == response_snapshot


@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_load(flash_llama, generate_load, response_snapshot):
    responses = await generate_load(flash_llama, "Test request", max_new_tokens=10, n=4)

    assert len(responses) == 4
    assert responses[0].generated_text == ':\n\n```json\n{\n  "'
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot


================================================
FILE: integration-tests/neuron/test_generate.py
================================================
import pytest


@pytest.fixture
async def tgi_service(neuron_launcher, neuron_model_config):
    model_name_or_path = neuron_model_config["neuron_model_path"]
    service_name = neuron_model_config["name"]
    with neuron_launcher(service_name, model_name_or_path) as tgi_service:
        await tgi_service.health(600)
        yield tgi_service


@pytest.mark.asyncio
async def test_model_single_request(tgi_service):
    service_name = tgi_service.client.service_name
    prompt = "What is Deep Learning?"
    # Greedy bounded without input
    response = await tgi_service.client.text_generation(
        prompt, max_new_tokens=17, details=True, decoder_input_details=True
    )
    assert response.details.generated_tokens == 17
    greedy_expectations = {
        "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial",
        "qwen2": " - Deep Learning is a subset of Machine Learning that involves the use of artificial neural networks",
        "granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and",
        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
        "phi3": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
    }
    assert response.generated_text == greedy_expectations[service_name]

    # Greedy bounded with input
    greedy_response = await tgi_service.client.text_generation(
        "What is Deep Learning?",
        max_new_tokens=17,
        return_full_text=True,
        details=True,
        decoder_input_details=True,
    )
    assert greedy_response.details.generated_tokens == 17
    assert greedy_response.generated_text == prompt + greedy_expectations[service_name]

    # Sampling
    response = await tgi_service.client.text_generation(
        "What is Deep Learning?",
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        max_new_tokens=128,
        seed=42,
    )
    # The response must be different
    assert not response.startswith(greedy_expectations[service_name])

    # Greedy with stop sequence (using one of the words returned from the previous test)
    stop_sequence = greedy_response.generated_text.split(" ")[-5]
    response = await tgi_service.client.text_generation(
        "What is Deep Learning?",
        do_sample=False,
        max_new_tokens=128,
        stop_sequences=[stop_sequence],
    )
    assert response.endswith(stop_sequence)


@pytest.mark.asyncio
async def test_model_multiple_requests(tgi_service, neuron_generate_load):
    num_requests = 4
    responses = await neuron_generate_load(
        tgi_service.client,
        "What is Deep Learning?",
        max_new_tokens=17,
        n=num_requests,
    )

    assert len(responses) == 4
    expectations = {
        "llama": "Deep learning is a subset of machine learning that uses artificial",
        "qwen2": "Deep Learning is a subset of Machine Learning that involves",
        "granite": "Deep Learning is a subset of machine learning that is inspired by the structure and",
        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
        "phi3": "Deep learning is a subfield of machine learning that focuses on creating",
    }
    expected = expectations[tgi_service.client.service_name]
    for r in responses:
        assert r.details.generated_tokens == 17
        assert expected in r.generated_text


================================================
FILE: integration-tests/neuron/test_implicit_env.py
================================================
import os

import pytest


@pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])
async def tgi_service(request, neuron_launcher, neuron_model_config):
    """Expose a TGI service corresponding to a model configuration

    For each model configuration, the service will be started using the following
    deployment options:
    - from the hub original model (export parameters chosen after hub lookup),
    - from the hub pre-exported neuron model,
    - from a local path to the neuron model.
    """
    # the tgi_env.py script will take care of setting these
    for var in [
        "MAX_BATCH_SIZE",
        "MAX_INPUT_TOKENS",
        "MAX_TOTAL_TOKENS",
        "HF_NUM_CORES",
        "HF_AUTO_CAST_TYPE",
    ]:
        if var in os.environ:
            del os.environ[var]
    if request.param == "hub":
        model_name_or_path = neuron_model_config["model_id"]
    elif request.param == "hub-neuron":
        model_name_or_path = neuron_model_config["neuron_model_id"]
    else:
        model_name_or_path = neuron_model_config["neuron_model_path"]
    service_name = neuron_model_config["name"]
    with neuron_launcher(service_name, model_name_or_path) as tgi_service:
        await tgi_service.health(600)
        yield tgi_service


@pytest.mark.asyncio
async def test_model_single_request(tgi_service):
    # Just verify that the generation works, and nothing is raised, with several set of params

    # No params
    await tgi_service.client.text_generation(
        "What is Deep Learning?",
    )

    response = await tgi_service.client.text_generation(
        "How to cook beans ?",
        max_new_tokens=17,
        details=True,
        decoder_input_details=True,
    )
    assert response.details.generated_tokens == 17

    # Sampling
    await tgi_service.client.text_generation(
        "What is Deep Learning?",
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        max_new_tokens=128,
        seed=42,
    )


================================================
FILE: integration-tests/pyproject.toml
================================================
[project]
name = "text-generation-integration-tests"
version = "2.0.1"
description = "Text Generation Inference integration tests"
authors = ["Nicolas Patry <nicolas@huggingface.co>"]
requires-python = ">=3.10,<3.13"

dependencies = [
    "pydantic>2,< 3",
    "syrupy>=4.8.0",
    "text-generation>=0.6.0",
    "pytest>=8.3.0",
    "pytest-asyncio>=0.23.1",
    "docker>=7",
    "numpy>=2.0",
    "openai>=1.65",
    "huggingface_hub>=0.29",
    "pillow>=11.1.0",
]

[tool.isort]
profile = "black"


================================================
FILE: integration-tests/pytest.ini
================================================
[pytest]
addopts = --snapshot-warn-unused
asyncio_mode = auto
markers =
    private: marks tests as requiring an admin hf token (deselect with '-m "not private"')


================================================
FILE: integration-tests/requirements.txt
================================================
# This file was autogenerated by uv via the following command:
#    uv pip compile pyproject.toml
aiohappyeyeballs==2.6.1
    # via aiohttp
aiohttp==3.11.13
    # via text-generation
aiosignal==1.3.2
    # via aiohttp
annotated-types==0.7.0
    # via pydantic
anyio==4.8.0
    # via
    #   httpx
    #   openai
attrs==25.3.0
    # via aiohttp
certifi==2025.1.31
    # via
    #   httpcore
    #   httpx
    #   requests
charset-normalizer==3.4.1
    # via requests
distro==1.9.0
    # via openai
docker==7.1.0
    # via text-generation-integration-tests (pyproject.toml)
filelock==3.18.0
    # via huggingface-hub
frozenlist==1.5.0
    # via
    #   aiohttp
    #   aiosignal
fsspec==2025.3.0
    # via huggingface-hub
h11==0.14.0
    # via httpcore
httpcore==1.0.7
    # via httpx
httpx==0.28.1
    # via openai
huggingface-hub==0.29.3
    # via
    #   text-generation-integration-tests (pyproject.toml)
    #   text-generation
idna==3.10
    # via
    #   anyio
    #   httpx
    #   requests
    #   yarl
iniconfig==2.0.0
    # via pytest
jiter==0.9.0
    # via openai
multidict==6.1.0
    # via
    #   aiohttp
    #   yarl
numpy==2.2.3
    # via text-generation-integration-tests (pyproject.toml)
openai==1.66.3
    # via text-generation-integration-tests (pyproject.toml)
packaging==24.2
    # via
    #   huggingface-hub
    #   pytest
pillow==11.1.0
    # via text-generation-integration-tests (pyproject.toml)
pluggy==1.5.0
    # via pytest
propcache==0.3.0
    # via
    #   aiohttp
    #   yarl
pydantic==2.10.6
    # via
    #   text-generation-integration-tests (pyproject.toml)
    #   openai
    #   text-generation
pydantic-core==2.27.2
    # via pydantic
pytest==8.3.5
    # via
    #   text-generation-integration-tests (pyproject.toml)
    #   pytest-asyncio
    #   syrupy
pytest-asyncio==0.25.3
    # via text-generation-integration-tests (pyproject.toml)
pyyaml==6.0.2
    # via huggingface-hub
requests==2.32.3
    # via
    #   docker
    #   huggingface-hub
sniffio==1.3.1
    # via
    #   anyio
    #   openai
syrupy==4.9.0
    # via text-generation-integration-tests (pyproject.toml)
text-generation==0.7.0
    # via text-generation-integration-tests (pyproject.toml)
tqdm==4.67.1
    # via
    #   huggingface-hub
    #   openai
typing-extensions==4.12.2
    # via
    #   anyio
    #   huggingface-hub
    #   openai
    #   pydantic
    #   pydantic-core
urllib3==2.3.0
    # via
    #   docker
    #   requests
yarl==1.18.3
    # via aiohttp


================================================
FILE: launcher/Cargo.toml
================================================
[package]
name = "text-generation-launcher"
description = "Text Generation Launcher"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[dependencies]
clap = { version = "4.4.5", features = ["derive", "env"] }
ctrlc = { version = "3.4.1", features = ["termination"] }
hf-hub = "0.4.2"
nix = { version = "0.28.0", features = ["signal"] }
once_cell = "1.19.0"
pyo3 = { workspace = true }
serde = { version = "1.0.188", features = ["derive"] }
serde_json = "1.0.107"
thiserror = "1.0.59"
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
regex = "1.11.0"

[dev-dependencies]
float_eq = "1.0.1"
reqwest = { version = "0.11.20", features = ["blocking", "json"] }

[build-dependencies]
vergen = { version = "8.2.5", features = ["build", "cargo", "git", "gitcl", "rustc", "si"] }


================================================
FILE: launcher/build.rs
================================================
use std::error::Error;
use vergen::EmitBuilder;

fn main() -> Result<(), Box<dyn Error>> {
    // Emit cargo and rustc compile time values
    EmitBuilder::builder().all_cargo().all_rustc().emit()?;

    // Try to get the git sha from the local git repository
    if EmitBuilder::builder()
        .fail_on_error()
        .git_sha(false)
        .emit()
        .is_err()
    {
        // Unable to get the git sha
        if let Ok(sha) = std::env::var("GIT_SHA") {
            // Set it from an env var
            println!("cargo:rustc-env=VERGEN_GIT_SHA={sha}");
        }
    }

    // Set docker label if present
    if let Ok(label) = std::env::var("DOCKER_LABEL") {
        // Set it from an env var
        println!("cargo:rustc-env=DOCKER_LABEL={label}");
    }

    Ok(())
}


================================================
FILE: launcher/src/env_runtime.rs
================================================
use std::fmt;
use std::process::Command;

pub(crate) struct Env {
    cargo_target: &'static str,
    cargo_version: &'static str,
    git_sha: &'static str,
    docker_label: &'static str,
    nvidia_env: String,
    xpu_env: String,
    hpu_env: String,
}

impl Env {
    pub fn new() -> Self {
        let nvidia_env = nvidia_smi();
        let xpu_env = xpu_smi();
        let hpu_env = hl_smi();

        Self {
            nvidia_env: nvidia_env.unwrap_or("N/A".to_string()),
            xpu_env: xpu_env.unwrap_or("N/A".to_string()),
            hpu_env: hpu_env.unwrap_or("N/A".to_string()),
            cargo_target: env!("VERGEN_CARGO_TARGET_TRIPLE"),
            cargo_version: env!("VERGEN_RUSTC_SEMVER"),
            git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
            docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"),
        }
    }
}

impl fmt::Display for Env {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        writeln!(f, "Runtime environment:")?;

        writeln!(f, "Target: {}", self.cargo_target)?;
        writeln!(f, "Cargo version: {}", self.cargo_version)?;
        writeln!(f, "Commit sha: {}", self.git_sha)?;
        writeln!(f, "Docker label: {}", self.docker_label)?;
        writeln!(f, "nvidia-smi:\n{}", self.nvidia_env)?;
        writeln!(f, "xpu-smi:\n{}", self.xpu_env)?;
        writeln!(f, "hpu-smi:\n{}", self.hpu_env)?;

        Ok(())
    }
}

fn nvidia_smi() -> Option<String> {
    let output = Command::new("nvidia-smi").output().ok()?;
    let nvidia_smi = String::from_utf8(output.stdout).ok()?;
    let output = nvidia_smi.replace('\n', "\n   ");
    Some(output.trim().to_string())
}

fn xpu_smi() -> Option<String> {
    let output = Command::new("xpu-smi").arg("discovery").output().ok()?;
    let xpu_smi = String::from_utf8(output.stdout).ok()?;
    let output = xpu_smi.replace('\n', "\n   ");
    Some(output.trim().to_string())
}

fn hl_smi() -> Option<String> {
    let output = Command::new("hl-smi").output().ok()?;
    let hl_smi = String::from_utf8(output.stdout).ok()?;
    let output = hl_smi.replace('\n', "\n   ");
    Some(output.trim().to_string())
}


================================================
FILE: launcher/src/gpu.rs
================================================
pub fn get_cuda_capability() -> Option<(usize, usize)> {
    use pyo3::prelude::*;

    let py_get_capability = |py: Python| -> PyResult<(isize, isize)> {
        let torch = py.import_bound("torch.cuda")?;
        let get_device_capability = torch.getattr("get_device_capability")?;
        get_device_capability.call0()?.extract()
    };

    match pyo3::Python::with_gil(py_get_capability) {
        Ok((major, minor)) if major < 0 || minor < 0 => {
            tracing::warn!("Ignoring negative GPU compute capabilities: {major}.{minor}");
            None
        }
        Ok((major, minor)) => Some((major as usize, minor as usize)),
        Err(err) => {
            tracing::warn!("Cannot determine GPU compute capability: {}", err);
            None
        }
    }
}


================================================
FILE: launcher/src/main.rs
================================================
use clap::{Parser, ValueEnum};
use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
use nix::sys::signal::{self, Signal};
use nix::unistd::Pid;
use serde::Deserialize;
use std::env;
use std::ffi::OsString;
use std::io::{BufRead, BufReader};
use std::os::unix::process::{CommandExt, ExitStatusExt};
use std::path::Path;
use std::process::{Child, Command, ExitStatus, Stdio};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::mpsc::TryRecvError;
use std::sync::{mpsc, Arc};
use std::thread;
use std::thread::sleep;
use std::time::{Duration, Instant};
use std::{
    fs, io,
    io::{Read, Write},
};
use thiserror::Error;
use tracing_subscriber::{filter::LevelFilter, EnvFilter};

mod env_runtime;
mod gpu;

fn compute_optimal(config: Option<&Config>, compute: Option<&ComputeType>) -> Option<usize> {
    let config = config?;
    let compute = compute?;
    let f16_max_compute = compute.f16_flop()?;
    let model_compute = config.flop()?;
    tracing::debug!(
        "Max compute {} model compute {}",
        human_size(f16_max_compute as usize, "flop"),
        human_size(model_compute as usize, "flop")
    );
    let optimal_size = (f16_max_compute / model_compute) as usize;
    if optimal_size > 100 {
        // Ignore calculations that's too low
        // Most likely an error
        Some(optimal_size)
    } else {
        None
    }
}

fn human_size(size: usize, suffix: &str) -> String {
    let mut size: f64 = size as f64;
    let mut p = "";
    for prefix in ["", "K", "M", "G", "T"] {
        p = prefix;
        if size > 1_000.0 {
            size /= 1_000.0;
        } else {
            break;
        }
    }
    format!("{size:.2}{p}{suffix}")
}

fn vram_maximum(
    config: Option<&Config>,
    compute: Option<&ComputeType>,
    memory_fraction: f32,
) -> Option<usize> {
    let config = config?;
    let compute = compute?;
    let available = compute.vram(memory_fraction)?;
    let model = config.model_vram()?;
    let token_vram = config.token_vram()?;
    if let Some(vram) = available.checked_sub(model) {
        let tokens_allowed = vram / token_vram;
        tracing::debug!(
            "Available vram {}: model needs {}, every tokens requires {}, maximum allocatable tokens {tokens_allowed}",
            human_size(available, "B"),
            human_size(model, "B"),
            human_size(token_vram, "B"),
        );
        Some(tokens_allowed)
    } else {
        tracing::warn!(
            "Not enough VRAM to run the model: Available: {} - Model {}.",
            human_size(available, "B"),
            human_size(model, "B")
        );
        None
    }
}

fn get_config(
    model_id: &str,
    revision: &Option<String>,
) -> Result<Config, Box<dyn std::error::Error>> {
    let mut path = std::path::Path::new(model_id).to_path_buf();
    let model_id = model_id.to_string();
    let filename = if !path.exists() {
        // Assume it's a hub id

        let mut builder = ApiBuilder::from_env();
        if let Ok(token) = std::env::var("HF_TOKEN") {
            // env variable has precedence over on file token.
            builder = builder.with_token(Some(token))
        };
        if let Ok(origin) = env::var("HF_HUB_USER_AGENT_ORIGIN") {
            builder = builder.with_user_agent("origin", origin.as_str());
        }
        let api = builder.build()?;
        let repo = if let Some(ref revision) = revision {
            api.repo(Repo::with_revision(
                model_id,
                RepoType::Model,
                revision.to_string(),
            ))
        } else {
            api.model(model_id)
        };
        repo.get("config.json")?
    } else {
        path.push("config.json");
        path
    };

    let content = std::fs::read_to_string(filename)?;
    let config: RawConfig = serde_json::from_str(&content)?;

    let config: Config = config.into();
    Ok(config)
}

fn resolve_attention(config: &Option<Config>, lora_adapters: &Option<String>) -> (String, String) {
    let compute_capability = gpu::get_cuda_capability();
    let mut prefix_caching: Option<String> = std::env::var("PREFIX_CACHING").ok();
    let mut attention: Option<String> = std::env::var("ATTENTION").ok();
    if let Some(config) = config {
        if prefix_caching.is_none() {
            if config.vision_config.is_some() {
                tracing::info!("Disabling prefix caching because of VLM model");
                prefix_caching = Some("0".to_string());
            } else if config.is_encoder_decoder {
                tracing::info!("Disabling prefix caching because of seq2seq model");
                prefix_caching = Some("0".to_string());
            }
        }

        let fallback_attention = if compute_capability.is_none()
            || matches!(compute_capability, Some((major, _)) if major < 8)
        {
            "paged"
        } else {
            "flashdecoding"
        };

        match config.get_head_dim() {
            Some(h) if h == 64 || h == 128 || h == 256 => {
                if lora_adapters.is_some() && prefix_caching.is_none() {
                    tracing::info!("Disabling prefix caching because of lora adapters");
                    prefix_caching = Some("0".to_string());
                }
                match config.model_type.as_deref() {
                    Some("falcon") | Some("deepseek_v2") => {
                        // Required because gemma2 needs bfloat16 which is not supported by
                        // flashinfer ?
                        if attention.is_none() {
                            tracing::info!(
                                "Forcing attention to '{fallback_attention}' because model {} requires it",
                                config.model_type.as_ref().unwrap()
                            );
                            attention = Some(fallback_attention.to_string());
                        }
                        if fallback_attention == "paged" && prefix_caching.is_none() {
                            tracing::info!("Disabling prefix caching because it is not supported with 'paged' attention");
                            prefix_caching = Some("0".to_string());
                        }
                    }
                    Some("t5") => {}
                    _ => {}
                }
            }
            _ => {
                if attention.is_none() {
                    tracing::info!("Forcing attention to '{fallback_attention}' because head dim is not supported by flashinfer, also disabling prefix caching");
                    attention = Some(fallback_attention.to_string());
                }
                if prefix_caching.is_none() {
                    prefix_caching = Some("0".to_string());
                }
            }
        }
    }
    if attention == Some("paged".to_string()) && prefix_caching.is_none() {
        tracing::info!("Disabling prefix caching on paged attention");
        prefix_caching = Some("0".to_string());
    }

    let attention = attention.unwrap_or("flashinfer".to_string());
    let prefix_caching = prefix_caching.unwrap_or("true".to_string());

    (prefix_caching, attention)
}

#[derive(Deserialize)]
struct RawConfig {
    max_position_embeddings: Option<usize>,
    n_positions: Option<usize>,
    model_type: Option<String>,
    max_seq_len: Option<usize>,
    quantization_config: Option<QuantizationConfig>,
    n_embd: Option<usize>,
    hidden_size: Option<usize>,
    intermediate_size: Option<usize>,
    num_attention_heads: Option<usize>,
    num_key_value_heads: Option<usize>,
    num_hidden_layers: Option<usize>,
    head_dim: Option<usize>,
    text_config: Option<TextConfig>,
    vision_config: Option<VisionConfig>,
    is_encoder_decoder: Option<bool>,
    #[serde(rename = "num_experts_per_tok")]
    num_experts_per_token: Option<usize>,
    #[serde(rename = "n_shared_experts")]
    num_shared_experts: Option<usize>,
    #[serde(rename = "num_local_experts")]
    num_experts: Option<usize>,
    vocab_size: Option<usize>,
}

#[derive(Deserialize)]
struct QuantizationConfig {
    quant_method: Option<Quantization>,
}

#[derive(Debug, Deserialize)]
struct VisionConfig {}

#[derive(Debug, Deserialize)]
struct TextConfig {
    head_dim: Option<usize>,
}

#[derive(Debug, Deserialize)]
struct Config {
    max_position_embeddings: Option<usize>,
    quantize: Option<Quantization>,
    head_dim: Option<usize>,
    num_heads: Option<usize>,
    num_kv_heads: Option<usize>,
    num_layers: Option<usize>,
    intermediate_size: Option<usize>,
    hidden_size: Option<usize>,
    model_type: Option<String>,
    text_config: Option<TextConfig>,
    vision_config: Option<VisionConfig>,
    is_encoder_decoder: bool,
    num_experts_per_token: usize,
    num_shared_experts: usize,
    num_experts: usize,
    vocab_size: Option<usize>,
}

impl Config {
    fn get_head_dim(&self) -> Option<usize> {
        if let Some(head_dim) = self.head_dim {
            return Some(head_dim);
        }

        let text_config = self.text_config.as_ref()?;
        if let Some(head_size) = text_config.head_dim {
            return Some(head_size);
        }

        match self.model_type.as_deref() {
            // We special-case gemma3 here, since we need flashinfer for
            // handling bidirectional masks. And flashinfer can only be
            // used when the head size is known.
            Some("gemma3") => Some(256),
            _ => None,
        }
    }

    fn flop(&self) -> Option<u64> {
        if self.vision_config.is_some() {
            // VLM are much harder to predict and VRAM requirements
            // Are more complex.
            return None;
        }
        let num_heads = self.num_heads? as u64;
        let num_kv_heads = self.num_kv_heads? as u64;
        let head_dim = self.get_head_dim()? as u64;
        let hidden_size = self.hidden_size? as u64;
        let intermediate_size = (self.intermediate_size?
            * (self.num_experts_per_token + self.num_shared_experts))
            as u64;
        let num_layers = self.num_layers? as u64;

        let q_flops = 2 * num_heads * head_dim * hidden_size;
        let k_flops = 2 * num_kv_heads * head_dim * hidden_size;
        let v_flops = 2 * num_kv_heads * head_dim * hidden_size;
        let attn_flops = 2 * num_heads * head_dim * hidden_size;
        let o_flops = 2 * num_heads * head_dim * hidden_size;
        let attn_layer_flops = q_flops + k_flops + v_flops + attn_flops + o_flops;

        let gate_up_down_flops = 2 * 3 * hidden_size * intermediate_size;

        let layer_flops = attn_layer_flops + gate_up_down_flops;
        let total = layer_flops * num_layers;
        Some(total)
    }

    fn kv_vram_per_tok(&self) -> Option<usize> {
        if self.quantize.is_some() {
            // TODO handle quantization
            return None;
        }
        // 2 for key and values
        // 2 for f16 dtype?
        Some(self.num_kv_heads? * 2 * self.get_head_dim()? * 2 * self.num_layers?)
    }

    fn mlp_vram_per_tok(&self) -> Option<usize> {
        // TODO handle quantization
        // TODO This calculation depends on the actual implementation
        let dtype_size = 2;
        let mlp_size = self.intermediate_size?;
        // calculation is overshooting here.
        // Coming from here: https://github.com/vllm-project/vllm/blob/d1c2e15eb31ef12e688ce0cb71895f88eaf4cd4f/vllm/model_executor/layers/fused_moe/fused_moe.py#L618-L624
        Some((mlp_size + mlp_size / 2) * self.num_experts * dtype_size * 3)
    }

    fn token_vram(&self) -> Option<usize> {
        let kv = self.kv_vram_per_tok()?;
        let mlp_intermediary = self.mlp_vram_per_tok()?;
        let per_tok = kv + mlp_intermediary;
        Some(per_tok)
    }

    fn model_vram(&self) -> Option<usize> {
        let attn_vram = (self.num_heads? + 2 * self.num_kv_heads?) * self.get_head_dim()?;
        let o_vram = self.num_heads? * self.get_head_dim()? * self.hidden_size?;
        // gate + up + down = 3
        let mlp_vram = 3 * self.intermediate_size? * self.num_experts * self.hidden_size?;
        let layer_vram = mlp_vram + attn_vram + o_vram;
        let vocab = self.hidden_size? * self.vocab_size?;
        let params = layer_vram * self.num_layers? + 2 * vocab;
        let dtype_size = 2;
        if self.quantize.is_some() {
            // TODO handle quantization
            return None;
        }
        Some(params * dtype_size)
    }
}

impl From<RawConfig> for Config {
    fn from(other: RawConfig) -> Self {
        let max_position_embeddings = other
            .max_position_embeddings
            .or(other.max_seq_len)
            .or(other.n_positions);
        let quantize = other.quantization_config.and_then(|q| q.quant_method);
        let hidden_size = other.hidden_size.or(other.n_embd);
        let head_dim = other
            .head_dim
            .or_else(|| match (hidden_size, other.num_attention_heads) {
                (Some(hidden_size), Some(num_attention_heads))
                    if hidden_size % num_attention_heads == 0 =>
                {
                    Some(hidden_size / num_attention_heads)
                }
                _ => None,
            });
        let num_heads = other.num_attention_heads;
        let num_layers = other.num_hidden_layers;
        let num_kv_heads = other.num_key_value_heads.or(other.num_attention_heads);
        let intermediate_size = other.intermediate_size;
        let model_type = other.model_type;
        let text_config = other.text_config;
        let vision_config = other.vision_config;
        let is_encoder_decoder = other.is_encoder_decoder.unwrap_or(false);
        let num_experts_per_token = other.num_experts_per_token.unwrap_or(1);
        let num_shared_experts = other.num_shared_experts.unwrap_or(0);
        let num_experts = other.num_experts.unwrap_or(1);
        let vocab_size = other.vocab_size;
        Config {
            max_position_embeddings,
            quantize,
            head_dim,
            model_type,
            text_config,
            vision_config,
            is_encoder_decoder,
            hidden_size,
            num_heads,
            num_kv_heads,
            intermediate_size,
            num_layers,
            num_experts_per_token,
            num_shared_experts,
            num_experts,
            vocab_size,
        }
    }
}

#[derive(Clone, Copy, Debug, ValueEnum, Deserialize)]
#[serde(rename_all = "kebab-case")]
enum Quantization {
    /// 4 bit quantization. Requires a specific AWQ quantized model:
    ///   <https://hf.co/models?search=awq>.
    /// Should replace GPTQ models wherever possible because of the better latency
    Awq,
    /// Compressed tensors, which can be a mixture of different quantization methods.
    CompressedTensors,
    /// 8 bit quantization, doesn't require specific model.
    /// Should be a drop-in replacement to bitsandbytes with much better performance.
    /// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
    Eetq,
    /// Variable bit quantization. Requires a specific EXL2 quantized model:
    /// <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does
    /// not support tensor parallelism (num_shard > 1).
    Exl2,
    /// 4 bit quantization. Requires a specific GTPQ quantized model: <https://hf.co/models?search=gptq>.
    /// text-generation-inference will use exllama (faster) kernels wherever possible, and use
    /// triton kernel (wider support) when it's not.
    /// AWQ has faster kernels.
    Gptq,
    /// 4 bit quantization. Requires a specific Marlin quantized model: <https://hf.co/models?search=marlin>.
    Marlin,
    /// Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half,
    /// but it is known that the model will be much slower to run than the native f16.
    // #[deprecated(
    //     since = "1.1.0",
    //     note = "Use `eetq` instead, which provides better latencies overall and is drop-in in most cases"
    // )]
    Bitsandbytes,
    /// Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x,
    /// but it is known that the model will be much slower to run than the native f16.
    BitsandbytesNf4,
    /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
    /// perplexity performance for you model
    BitsandbytesFp4,
    /// [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above
    /// This dtype has native ops should be the fastest if available.
    /// This is currently not the fastest because of local unpacking + padding to satisfy matrix
    /// multiplication limitations.
    Fp8,
}

impl std::fmt::Display for Quantization {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // To keep in track with `server`.
        match self {
            #[allow(deprecated)]
            // Use `eetq` instead, which provides better latencies overall and is drop-in in most cases
            Quantization::Bitsandbytes => {
                write!(f, "bitsandbytes")
            }
            Quantization::BitsandbytesNf4 => {
                write!(f, "bitsandbytes-nf4")
            }
            Quantization::BitsandbytesFp4 => {
                write!(f, "bitsandbytes-fp4")
            }
            Quantization::Exl2 => {
                write!(f, "exl2")
            }
            Quantization::Gptq => {
                write!(f, "gptq")
            }
            Quantization::Marlin => {
                write!(f, "marlin")
            }
            Quantization::Awq => {
                write!(f, "awq")
            }
            Quantization::CompressedTensors => {
                write!(f, "compressed-tensors")
            }
            Quantization::Eetq => {
                write!(f, "eetq")
            }
            Quantization::Fp8 => {
                write!(f, "fp8")
            }
        }
    }
}

#[derive(Clone, Copy, Debug, ValueEnum)]
enum Dtype {
    Float16,
    #[clap(name = "bfloat16")]
    BFloat16,
}

impl std::fmt::Display for Dtype {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // To keep in track with `server`.
        match self {
            Dtype::Float16 => {
                write!(f, "float16")
            }
            Dtype::BFloat16 => {
                write!(f, "bfloat16")
            }
        }
    }
}

#[derive(Clone, Copy, Debug, ValueEnum)]
enum KVCacheDtype {
    #[clap(name = "fp8_e4m3fn")]
    Fp8e4m3fn,

    #[clap(name = "fp8_e5m2")]
    Fp8e5m2,
}

impl std::fmt::Display for KVCacheDtype {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            KVCacheDtype::Fp8e4m3fn => {
                write!(f, "fp8_e4m3fn")
            }
            KVCacheDtype::Fp8e5m2 => {
                write!(f, "fp8_e5m2")
            }
        }
    }
}

#[derive(Clone, Copy, Debug, ValueEnum)]
enum RopeScaling {
    Linear,
    Dynamic,
}

impl std::fmt::Display for RopeScaling {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // To keep in track with `server`.
        match self {
            RopeScaling::Linear => {
                write!(f, "linear")
            }
            RopeScaling::Dynamic => {
                write!(f, "dynamic")
            }
        }
    }
}

#[derive(Clone, Copy, Debug, ValueEnum)]
pub enum UsageStatsLevel {
    /// Default option, usage statistics are collected anonymously
    On,
    /// Disables all collection of usage statistics
    Off,
    /// Doesn't send the error stack trace or error type, but allows sending a crash event
    NoStack,
}

impl std::fmt::Display for UsageStatsLevel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // To keep in track with `server`.
        match self {
            UsageStatsLevel::On => {
                write!(f, "on")
            }
            UsageStatsLevel::Off => {
                write!(f, "off")
            }
            UsageStatsLevel::NoStack => {
                write!(f, "no-stack")
            }
        }
    }
}

/// App Configuration
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    /// The name of the model to load.
    /// Can be a MODEL_ID as listed on <https://hf.co/models> like
    /// `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`.
    /// Or it can be a local directory containing the necessary files
    /// as saved by `save_pretrained(...)` methods of transformers
    #[clap(default_value = "bigscience/bloom-560m", long, env)]
    model_id: String,

    /// The actual revision of the model if you're referring to a model
    /// on the hub. You can use a specific commit id or a branch like `refs/pr/2`.
    #[clap(long, env)]
    revision: Option<String>,

    /// The number of tokenizer workers used for payload validation and truncation inside the
    /// router.
    #[clap(default_value = "2", long, env)]
    validation_workers: usize,

    /// Whether to shard the model across multiple GPUs
    /// By default text-generation-inference will use all available GPUs to run
    /// the model. Setting it to `false` deactivates `num_shard`.
    #[clap(long, env)]
    sharded: Option<bool>,

    /// The number of shards to use if you don't want to use all GPUs on a given machine.
    /// You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num_shard 2`
    /// and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num_shard 2` to
    /// launch 2 copies with 2 shard each on a given machine with 4 GPUs for instance.
    #[clap(long, env)]
    num_shard: Option<usize>,

    /// Quantization method to use for the model. It is not necessary to specify this option
    /// for pre-quantized models, since the quantization method is read from the model
    /// configuration.
    ///
    /// Marlin kernels will be used automatically for GPTQ/AWQ models.
    #[clap(long, env, value_enum)]
    quantize: Option<Quantization>,

    /// The number of input_ids to speculate on
    /// If using a medusa model, the heads will be picked up automatically
    /// Other wise, it will use n-gram speculation which is relatively free
    /// in terms of compute, but the speedup heavily depends on the task.
    #[clap(long, env)]
    speculate: Option<usize>,

    /// The dtype to be forced upon the model. This option cannot be used with `--quantize`.
    #[clap(long, env, value_enum)]
    dtype: Option<Dtype>,

    /// Specify the dtype for the key-value cache. When this option is not provided,
    /// the dtype of the model is used (typically `float16` or `bfloat16`). Currently
    /// the only supported value are `fp8_e4m3fn` and `fp8_e5m2` on CUDA.
    #[clap(long, env, value_enum)]
    kv_cache_dtype: Option<KVCacheDtype>,

    /// Whether you want to execute hub modelling code. Explicitly passing a `revision` is
    /// encouraged when loading a model with custom code to ensure no malicious code has been
    /// contributed in a newer revision.
    #[clap(long, env, value_enum)]
    trust_remote_code: bool,

    /// The maximum amount of concurrent requests for this particular deployment.
    /// Having a low limit will refuse clients requests instead of having them
    /// wait for too long and is usually good to handle backpressure correctly.
    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,

    /// This is the maximum allowed value for clients to set `best_of`.
    /// Best of makes `n` generations at the same time, and return the best
    /// in terms of overall log probability over the entire generated sequence
    #[clap(default_value = "2", long, env)]
    max_best_of: usize,

    /// This is the maximum allowed value for clients to set `stop_sequences`.
    /// Stop sequences are used to allow the model to stop on more than just
    /// the EOS token, and enable more complex "prompting" where users can preprompt
    /// the model in a specific way and define their "own" stop token aligned with
    /// their prompt.
    #[clap(default_value = "4", long, env)]
    max_stop_sequences: usize,

    /// This is the maximum allowed value for clients to set `top_n_tokens`.
    /// `top_n_tokens` is used to return information about the the `n` most likely
    /// tokens at each generation step, instead of just the sampled token. This
    /// information can be used for downstream tasks like for classification or
    /// ranking.
    #[clap(default_value = "5", long, env)]
    max_top_n_tokens: u32,

    /// This is the maximum allowed input length (expressed in number of tokens)
    /// for users. The larger this value, the longer prompt users can send which
    /// can impact the overall memory required to handle the load.
    /// Please note that some models have a finite range of sequence they can handle.
    /// Default to min(max_allocatable, max_position_embeddings) - 1
    #[clap(long, env)]
    max_input_tokens: Option<usize>,

    /// Legacy version of [`Args::max_input_tokens`].
    #[clap(long, env)]
    max_input_length: Option<usize>,

    /// This is the most important value to set as it defines the "memory budget"
    /// of running clients requests.
    /// Clients will send input sequences and ask to generate `max_new_tokens`
    /// on top. with a value of `1512` users can send either a prompt of
    /// `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for
    /// `1511` max_new_tokens.
    /// The larger this value, the larger amount each request will be in your RAM
    /// and the less effective batching can be.
    /// Default to min(max_allocatable, max_position_embeddings)
    #[clap(long, env)]
    max_total_tokens: Option<usize>,

    /// This represents the ratio of waiting queries vs running queries where
    /// you want to start considering pausing the running queries to include the waiting
    /// ones into the same batch.
    /// `waiting_served_ratio=1.2` Means when 12 queries are waiting and there's
    /// only 10 queries left in the current batch we check if we can fit those 12
    /// waiting queries into the batching strategy, and if yes, then batching happens
    /// delaying the 10 running queries by a `prefill` run.
    ///
    /// This setting is only applied if there is room in the batch
    /// as defined by `max_batch_total_tokens`.
    #[clap(default_value = "0.3", long, env)]
    waiting_served_ratio: f32,

    /// Limits the number of tokens for the prefill operation.
    /// Since this operation take the most memory and is compute bound, it is interesting
    /// to limit the number of requests that can be sent.
    /// Default to `max_input_tokens + 50` to give a bit of room.
    #[clap(long, env)]
    max_batch_prefill_tokens: Option<u32>,

    /// **IMPORTANT** This is one critical control to allow maximum usage
    /// of the available hardware.
    ///
    /// This represents the total amount of potential tokens within a batch.
    /// When using padding (not recommended) this would be equivalent of
    /// `batch_size` * `max_total_tokens`.
    ///
    /// However in the non-padded (flash attention) version this can be much finer.
    ///
    /// For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100`
    /// or a single query of `1000` tokens.
    ///
    /// Overall this number should be the largest possible amount that fits the
    /// remaining memory (after the model is loaded). Since the actual memory overhead
    /// depends on other parameters like if you're using quantization, flash attention
    /// or the model implementation, text-generation-inference infers this number automatically
    /// if not provided ensuring that the value is as large as possible.
    #[clap(long, env)]
    max_batch_total_tokens: Option<u32>,

    /// This setting defines how many tokens can be passed before forcing the waiting
    /// queries to be put on the batch (if the size of the batch allows for it).
    /// New queries require 1 `prefill` forward, which is different from `decode`
    /// and therefore you need to pause the running batch in order to run `prefill`
    /// to create the correct values for the waiting queries to be able to join the batch.
    ///
    /// With a value too small, queries will always "steal" the compute to run `prefill`
    /// and running queries will be delayed by a lot.
    ///
    /// With a value too big, waiting queries could wait for a very long time
    /// before being allowed a slot in the running batch. If your server is busy
    /// that means that requests that could run in ~2s on an empty server could
    /// end up running in ~20s because the query had to wait for 18s.
    ///
    /// This number is expressed in number of tokens to make it a bit more
    /// "model" agnostic, but what should really matter is the overall latency
    /// for end users.
    #[clap(default_value = "20", long, env)]
    max_waiting_tokens: usize,

    /// Enforce a maximum number of requests per batch
    /// Specific flag for hardware targets that do not support unpadded inference
    #[clap(long, env)]
    max_batch_size: Option<usize>,

    /// Specify the batch sizes to compute cuda graphs for.
    /// Use "0" to disable.
    /// Default = "1,2,4,8,16,32"
    #[clap(long, env, value_delimiter = ',')]
    cuda_graphs: Option<Vec<usize>>,

    /// The IP address to listen on
    #[clap(default_value = "0.0.0.0", long, env)]
    hostname: String,

    /// The port to listen on.
    #[clap(default_value = "3000", long, short, env)]
    port: u16,

    /// The Prometheus port to listen on.
    #[clap(default_value = "9000", long, short, env)]
    prometheus_port: u16,

    /// The name of the socket for gRPC communication between the webserver
    /// and the shards.
    #[clap(default_value = "/tmp/text-generation-server", long, env)]
    shard_uds_path: String,

    /// The address the master shard will listen on. (setting used by torch distributed)
    #[clap(default_value = "localhost", long, env)]
    master_addr: String,

    /// The address the master port will listen on. (setting used by torch distributed)
    #[clap(default_value = "29500", long, env)]
    master_port: usize,

    /// The location of the huggingface hub cache.
    /// Used to override the location if you want to provide a mounted disk for instance
    #[clap(long, env)]
    huggingface_hub_cache: Option<String>,

    /// The location of the huggingface hub cache.
    /// Used to override the location if you want to provide a mounted disk for instance
    #[clap(long, env)]
    weights_cache_override: Option<String>,

    /// For some models (like bloom), text-generation-inference implemented custom
    /// cuda kernels to speed up inference. Those kernels were only tested on A100.
    /// Use this flag to disable them if you're running on different hardware and
    /// encounter issues.
    #[clap(long, env)]
    disable_custom_kernels: bool,

    /// Limit the CUDA available memory.
    /// The allowed value equals the total visible memory multiplied by cuda-memory-fraction.
    #[clap(default_value = "1.0", long, env)]
    cuda_memory_fraction: f32,

    /// Rope scaling will only be used for RoPE models
    /// and allow rescaling the position rotary to accomodate for
    /// larger prompts.
    ///
    /// Goes together with `rope_factor`.
    ///
    /// `--rope-factor 2.0` gives linear scaling with a factor of 2.0
    /// `--rope-scaling dynamic` gives dynamic scaling with a factor of 1.0
    /// `--rope-scaling linear` gives linear scaling with a factor of 1.0 (Nothing will be changed
    /// basically)
    ///
    /// `--rope-scaling linear --rope-factor` fully describes the scaling you want
    #[clap(long, env)]
    rope_scaling: Option<RopeScaling>,

    /// Rope scaling will only be used for RoPE models
    /// See `rope_scaling`
    #[clap(long, env)]
    rope_factor: Option<f32>,

    /// Outputs the logs in JSON format (useful for telemetry)
    #[clap(long, env)]
    json_output: bool,

    #[clap(long, env)]
    otlp_endpoint: Option<String>,

    #[clap(default_value = "text-generation-inference.router", long, env)]
    otlp_service_name: String,

    #[clap(long, env)]
    cors_allow_origin: Vec<String>,

    #[clap(long, env)]
    api_key: Option<String>,

    #[clap(long, env)]
    watermark_gamma: Option<f32>,
    #[clap(long, env)]
    watermark_delta: Option<f32>,

    /// Enable ngrok tunneling
    #[clap(long, env)]
    ngrok: bool,

    /// ngrok authentication token
    #[clap(long, env)]
    ngrok_authtoken: Option<String>,

    /// ngrok edge
    #[clap(long, env)]
    ngrok_edge: Option<String>,

    /// The path to the tokenizer config file. This path is used to load the tokenizer configuration which may
    /// include a `chat_template`. If not provided, the default config will be used from the model hub.
    #[clap(long, env)]
    tokenizer_config_path: Option<String>,

    /// Disable outlines grammar constrained generation.
    /// This is a feature that allows you to generate text that follows a specific grammar.
    #[clap(long, env)]
    disable_grammar_support: bool,

    /// Display a lot of information about your runtime environment
    #[clap(long, short, action)]
    env: bool,

    /// Control the maximum number of inputs that a client can send in a single request
    #[clap(default_value = "4", long, env)]
    max_client_batch_size: usize,

    /// Lora Adapters a list of adapter ids i.e. `repo/adapter1,repo/adapter2` to load during
    /// startup that will be available to callers via the `adapter_id` field in a request.
    #[clap(long, env)]
    lora_adapters: Option<String>,

    /// Control if anonymous usage stats are collected.
    /// Options are "on", "off" and "no-stack"
    /// Defaul is on.
    #[clap(default_value = "on", long, env)]
    usage_stats: UsageStatsLevel,

    /// Payload size limit in bytes
    ///
    /// Default is 2MB
    #[clap(default_value = "2000000", long, env)]
    payload_limit: usize,

    /// Enables prefill logprobs
    ///
    /// Logprobs in the prompt are deactivated by default because they consume
    /// a large amount of VRAM (especially for long prompts).
    /// Using this flag reallows users to ask for them.
    #[clap(long, env)]
    enable_prefill_logprobs: bool,

    /// Change timeout of graceful termination of the TGI server
    #[clap(default_value = "90", long, short, env)]
    graceful_termination_timeout: u64,
}

#[derive(Debug)]
enum ShardStatus {
    Ready,
    Failed(usize),
}

#[allow(clippy::too_many_arguments)]
fn shard_manager(
    model_id: String,
    revision: Option<String>,
    quantize: Option<Quantization>,
    speculate: Option<usize>,
    dtype: Option<Dtype>,
    kv_cache_dtype: Option<KVCacheDtype>,
    trust_remote_code: bool,
    uds_path: String,
    rank: usize,
    world_size: usize,
    master_addr: String,
    master_port: usize,
    huggingface_hub_cache: Option<String>,
    weights_cache_override: Option<String>,
    disable_custom_kernels: bool,
    watermark_gamma: Option<f32>,
    watermark_delta: Option<f32>,
    cuda_graphs: Vec<usize>,
    cuda_memory_fraction: f32,
    rope_scaling: Option<RopeScaling>,
    rope_factor: Option<f32>,
    max_total_tokens: Option<usize>,
    max_batch_size: Option<usize>,
    max_input_tokens: Option<usize>,
    lora_adapters: Option<String>,
    enable_prefill_logprobs: bool,
    otlp_endpoint: Option<String>,
    otlp_service_name: String,
    log_level: LevelFilter,
    status_sender: mpsc::Sender<ShardStatus>,
    shutdown: Arc<AtomicBool>,
    graceful_termination_timeout: u64,
    _shutdown_sender: mpsc::Sender<()>,
) {
    // Enter shard-manager tracing span
    let _span = tracing::span!(tracing::Level::INFO, "shard-manager", rank = rank).entered();

    // Get UDS path
    let uds_string = format!("{uds_path}-{rank}");
    let uds = Path::new(&uds_string);
    // Clean previous runs
    if uds.exists() {
        fs::remove_file(uds).unwrap();
    }

    // Process args
    let mut shard_args = vec![
        "serve".to_string(),
        model_id,
        "--uds-path".to_string(),
        uds_path,
        "--logger-level".to_string(),
        log_level.to_string().to_uppercase(),
        "--json-output".to_string(),
    ];

    // Activate trust remote code
    if trust_remote_code {
        shard_args.push("--trust-remote-code".to_string());
    }

    // Activate tensor parallelism
    if world_size > 1 {
        shard_args.push("--sharded".to_string());
    }

    if let Some(quantize) = quantize {
        shard_args.push("--quantize".to_string());
        shard_args.push(quantize.to_string())
    }

    if let Some(speculate) = speculate {
        shard_args.push("--speculate".to_string());
        shard_args.push(speculate.to_string())
    }

    if let Some(dtype) = dtype {
        shard_args.push("--dtype".to_string());
        shard_args.push(dtype.to_string())
    }

    if let Some(kv_cache_dtype) = kv_cache_dtype {
        shard_args.push("--kv-cache-dtype".to_string());
        shard_args.push(kv_cache_dtype.to_string())
    }

    // Model optional revision
    if let Some(revision) = revision {
        shard_args.push("--revision".to_string());
        shard_args.push(revision)
    }

    let rope = match (rope_scaling, rope_factor) {
        (None, None) => None,
        (Some(scaling), None) => Some((scaling, 1.0)),
        (Some(scaling), Some(factor)) => Some((scaling, factor)),
        (None, Some(factor)) => Some((RopeScaling::Linear, factor)),
    };

    // OpenTelemetry Endpoint
    if let Some(otlp_endpoint) = otlp_endpoint {
        shard_args.push("--otlp-endpoint".to_string());
        shard_args.push(otlp_endpoint);
    }

    // OpenTelemetry Service Name
    shard_args.push("--otlp-service-name".to_string());
    shard_args.push(otlp_service_name);

    // In case we use sliding window, we may ignore the sliding in flash for some backends depending on the parameter.
    if let Some(max_input_tokens) = max_input_tokens {
        shard_args.push("--max-input-tokens".to_string());
        shard_args.push(max_input_tokens.to_string());
    }

    // Copy current process env
    let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();

    // Remove LOG_LEVEL if present
    envs.retain(|(name, _)| name != "LOG_LEVEL");

    // Torch Distributed Env vars
    envs.push(("RANK".into(), rank.to_string().into()));
    envs.push(("WORLD_SIZE".into(), world_size.to_string().into()));
    envs.push(("MASTER_ADDR".into(), master_addr.into()));
    envs.push(("MASTER_PORT".into(), master_port.to_string().into()));
    envs.push(("TORCH_NCCL_AVOID_RECORD_STREAMS".into(), "1".into()));

    // CUDA memory fraction
    envs.push((
        "CUDA_MEMORY_FRACTION".into(),
        cuda_memory_fraction.to_string().into(),
    ));

    // Safetensors load fast
    envs.push(("SAFETENSORS_FAST_GPU".into(), "1".into()));

    // Disable progress bar
    envs.push(("HF_HUB_DISABLE_PROGRESS_BARS".into(), "1".into()));

    // Enable hf transfer for insane download speeds
    let enable_hf_transfer = env::var("HF_HUB_ENABLE_HF_TRANSFER").unwrap_or("1".to_string());
    envs.push((
        "HF_HUB_ENABLE_HF_TRANSFER".into(),
        enable_hf_transfer.into(),
    ));

    // Parse Inference API token
    if let Ok(api_token) = env::var("HF_API_TOKEN") {
        envs.push(("HF_TOKEN".into(), api_token.into()))
    };

    // Detect rope scaling
    // Sending as env instead of CLI args to not bloat everything
    // those only can be used by RoPE models, so passing information around
    // for all models will complexify code unnecessarily
    if let Some((scaling, factor)) = rope {
        envs.push(("ROPE_SCALING".into(), scaling.to_string().into()));
        envs.push(("ROPE_FACTOR".into(), factor.to_string().into()));
    }

    if let Some(max_total_tokens) = max_total_tokens {
        envs.push((
            "MAX_TOTAL_TOKENS".into(),
            max_total_tokens.to_string().into(),
        ));
    }
    if let Some(max_batch_size) = max_batch_size {
        envs.push(("MAX_BATCH_SIZE".into(), max_batch_size.to_string().into()));
    }

    // Lora Adapters
    if let Some(lora_adapters) = lora_adapters {
        envs.push(("LORA_ADAPTERS".into(), lora_adapters.into()));
    }

    // Logprobs
    if enable_prefill_logprobs {
        envs.push(("REQUEST_LOGPROBS".into(), "1".into()));
    }

    // If huggingface_hub_cache is some, pass it to the shard
    // Useful when running inside a docker container
    if let Some(huggingface_hub_cache) = huggingface_hub_cache {
        envs.push(("HUGGINGFACE_HUB_CACHE".into(), huggingface_hub_cache.into()));
    };

    // If weights_cache_override is some, pass it to the shard
    // Useful when running inside a HuggingFace Inference Endpoint
    if let Some(weights_cache_override) = weights_cache_override {
        envs.push((
            "WEIGHTS_CACHE_OVERRIDE".into(),
            weights_cache_override.into(),
        ));
    };

    // Enable experimental support for cuda graphs
    if !cuda_graphs.is_empty() {
        envs.push((
            "CUDA_GRAPHS".into(),
            cuda_graphs
                .into_iter()
                .map(|c| c.to_string())
                .collect::<Vec<_>>()
                .join(",")
                .into(),
        ));
    }

    // If disable_custom_kernels is true, pass it to the shard as an env var
    if disable_custom_kernels {
        envs.push(("DISABLE_CUSTOM_KERNELS".into(), "True".into()))
    }

    // Watermark Gamma
    if let Some(watermark_gamma) = watermark_gamma {
        envs.push(("WATERMARK_GAMMA".into(), watermark_gamma.to_string().into()))
    }

    // Watermark Delta
    if let Some(watermark_delta) = watermark_delta {
        envs.push(("WATERMARK_DELTA".into(), watermark_delta.to_string().into()))
    }

    // Start process
    tracing::info!("Starting shard");
    let mut p = match Command::new("text-generation-server")
        .args(shard_args)
        .env_clear()
        .envs(envs)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .process_group(0)
        .spawn()
    {
        Ok(p) => p,
        Err(err) => {
            if err.kind() == io::ErrorKind::NotFound {
                tracing::error!("text-generation-server not found in PATH");
                tracing::error!("Please install it with `make install-server`")
            }
            {
                tracing::error!("{}", err);
            }

            status_sender.send(ShardStatus::Failed(rank)).unwrap();
            return;
        }
    };

    // Redirect STDOUT to the console
    let mut pstdin = p.stdin.take().unwrap();
    let shard_stdout_reader = BufReader::new(p.stdout.take().unwrap());
    let shard_stderr_reader = BufReader::new(p.stderr.take().unwrap());

    //stdout tracing thread
    thread::spawn(move || {
        log_lines(shard_stdout_reader);
    });
    // We read stderr in another thread as it seems that lines() can block in some cases
    let (err_sender, err_receiver) = mpsc::channel();
    thread::spawn(move || {
        for line in shard_stderr_reader.lines().map_while(Result::ok) {
            err_sender.send(line).unwrap_or(());
        }
    });
    // We read stdin in another thread as it seems that lines() can block in some cases
    if LevelFilter::current() >= tracing::Level::DEBUG {
        thread::spawn(move || {
            let mut stdin = io::stdin(); // We get `Stdin` here.
            loop {
                let mut buffer = vec![0; 4096];
                if let Ok(n) = stdin.read(&mut buffer) {
                    if n > 0 {
                        let _ = pstdin.write_all(&buffer[..n]);
                    }
                }
            }
        });
    }

    let mut ready = false;
    let start_time = Instant::now();
    let mut wait_time = Instant::now();
    loop {
        // Process exited
        if let Some(exit_status) = p.try_wait().unwrap() {
            let mut err = String::new();
            while let Ok(line) = err_receiver.recv_timeout(Duration::from_millis(10)) {
                err = err + "\n" + &line;
            }

            tracing::error!("Shard complete standard error output:\n{err}");

            if let Some(signal) = exit_status.signal() {
                tracing::error!("Shard process was signaled to shutdown with signal {signal}");
            }

            status_sender.send(ShardStatus::Failed(rank)).unwrap();
            return;
        }

        // We received a shutdown signal
        if shutdown.load(Ordering::SeqCst) {
            terminate(
                "shard",
                p,
                Duration::from_secs(graceful_termination_timeout),
            )
            .unwrap();
            return;
        }

        // Shard is ready
        if uds.exists() && !ready {
            tracing::info!("Shard ready in {:?}", start_time.elapsed());
            status_sender.send(ShardStatus::Ready).unwrap();
            ready = true;
        } else if !ready && wait_time.elapsed() > Duration::from_secs(10) {
            tracing::info!("Waiting for shard to be ready...");
            wait_time = Instant::now();
        }
        sleep(Duration::from_millis(100));
    }
}

fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver<()>) {
    tracing::info!("Shutting down shards");
    // Update shutdown value to true
    // This will be picked up by the shard manager
    shutdown.store(true, Ordering::SeqCst);

    // Wait for shards to shutdown
    // This will block till all shutdown_sender are dropped
    let _ = shutdown_receiver.recv();
}

fn num_cuda_devices() -> Option<usize> {
    let devices = match env::var("CUDA_VISIBLE_DEVICES") {
        Ok(devices) => devices,
        Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
            Ok(devices) => {
                // NVIDIA_VISIBLE_DEVICES is always set when not specified and the nvidia container runtime is
                // in (jit-)cdi mode (since 1.14)
                // nvidia container runtime default mode switched from legacy to cdi mode from 1.18 on
                // Let's handle the void case as all here
                // See: https://github.com/NVIDIA/nvidia-container-toolkit
                if ["all", "void"].contains(&devices.trim())  {
                    // Count the number of all GPUs via nvidia-smi
                    let output = Command::new("nvidia-smi")
                        .args(["--query-gpu=uuid", "--format=csv,noheader"])
                        .output()
                        .ok()?;

                    String::from_utf8_lossy(&output.stdout)
                        .lines()
                        .filter(|line| !line.trim().is_empty())
                        .collect::<Vec<_>>().join(",")
                } else {
                    devices
                }
            }
            Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
        },
    };
    let n_devices = devices.split(',').count();
    Some(n_devices)
}

#[derive(Deserialize)]
#[serde(rename_all = "UPPERCASE")]
enum PythonLogLevelEnum {
    Trace,
    Debug,
    Info,
    Success,
    Warning,
    Error,
    Critical,
}

#[derive(Deserialize)]
struct PythonLogLevel {
    name: PythonLogLevelEnum,
}

#[derive(Deserialize)]
struct PythonLogRecord {
    level: PythonLogLevel,
}

#[derive(Deserialize)]
struct PythonLogMessage {
    text: String,
    record: PythonLogRecord,
}

impl PythonLogMessage {
    fn trace(&self) {
        match self.record.level.name {
            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text.trim_end()),
            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text.trim_end()),
            PythonLogLevelEnum::Info => tracing::info!("{}", self.text.trim_end()),
            PythonLogLevelEnum::Success => tracing::info!("{}", self.text.trim_end()),
            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text.trim_end()),
            PythonLogLevelEnum::Error => tracing::error!("{}", self.text.trim_end()),
            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text.trim_end()),
        }
    }
}

impl TryFrom<&[u8]> for PythonLogMessage {
    type Error = serde_json::Error;

    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
        serde_json::from_slice::<Self>(value)
    }
}

fn log_lines<R: Sized + Read>(mut bufread: BufReader<R>) {
    let mut buffer = vec![0u8; 8 * 4096];
    let mut stdout = std::io::stdout();
    loop {
        let n = bufread.read(&mut buffer);
        if let Ok(n) = n {
            if n > 0 {
                let mut lines = buffer[..n].split(|i| *i == b'\n').peekable();
                while let Some(line) = lines.next() {
                    match PythonLogMessage::try_from(line) {
                        Ok(log) => log.trace(),
                        // For interactive debugging ?
                        Err(_) => {
                            if LevelFilter::current() >= tracing::Level::DEBUG {
                                stdout.write_all(line).unwrap();
                                if lines.peek().is_some() {
                                    stdout.write_all(b"\n").unwrap();
                                }
                                stdout.flush().unwrap();
                            }
                        }
                    }
                }
            } else {
                break;
            }
        }
    }
}

fn find_num_shards(
    sharded: Option<bool>,
    num_shard: Option<usize>,
) -> Result<usize, LauncherError> {
    // get the number of shards given `sharded` and `num_shard`
    let num_shard = match (sharded, num_shard) {
        (Some(true), None) => {
            // try to default to the number of available GPUs
            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
            let n_devices = num_cuda_devices()
                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
            if n_devices <= 1 {
                return Err(LauncherError::NotEnoughCUDADevices(format!(
                    "`sharded` is true but only found {n_devices} CUDA devices"
                )));
            }
            n_devices
        }
        (Some(true), Some(num_shard)) => {
            // we can't have only one shard while sharded
            if num_shard <= 1 {
                return Err(LauncherError::ArgumentValidation(
                    "`sharded` is true but `num_shard` <= 1".to_string(),
                ));
            }
            num_shard
        }
        (Some(false), Some(num_shard)) => num_shard,
        (Some(false), None) => 1,
        (None, None) => num_cuda_devices().unwrap_or(1),
        (None, Some(num_shard)) => num_shard,
    };
    if num_shard < 1 {
        return Err(LauncherError::ArgumentValidation(
            "`num_shard` cannot be < 1".to_string(),
        ));
    }
    Ok(num_shard)
}

#[derive(Debug, Error)]
enum LauncherError {
    #[error("Invalid argument: {0}")]
    ArgumentValidation(String),
    #[error("not enough cuda devices: {0}")]
    NotEnoughCUDADevices(String),
    #[error("Download error")]
    DownloadError,
    #[error("Shard cannot start")]
    ShardCannotStart,
    #[error("Shard disconnected")]
    ShardDisconnected,
    #[error("Shard failed")]
    ShardFailed,
    #[error("Webserver failed")]
    WebserverFailed,
    #[error("Webserver cannot start")]
    WebserverCannotStart,
}

fn download_convert_model(
    model_id: &str,
    revision: Option<&str>,
    trust_remote_code: bool,
    huggingface_hub_cache: Option<&str>,
    weights_cache_override: Option<&str>,
    running: Arc<AtomicBool>,
    merge_lora: bool,
) -> Result<(), LauncherError> {
    // Enter download tracing span
    let _span = tracing::span!(tracing::Level::INFO, "download").entered();

    let mut download_args = vec![
        "download-weights".to_string(),
        model_id.to_string(),
        "--extension".to_string(),
        ".safetensors".to_string(),
        "--logger-level".to_string(),
        "INFO".to_string(),
        "--json-output".to_string(),
    ];

    if merge_lora {
        download_args.push("--merge-lora".to_string());
    }

    // Model optional revision
    if let Some(revision) = &revision {
        download_args.push("--revision".to_string());
        download_args.push(revision.to_string())
    }

    // Trust remote code for automatic peft fusion
    if trust_remote_code {
        download_args.push("--trust-remote-code".to_string());
    }

    // Copy current process env
    let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();

    // Remove LOG_LEVEL if present
    envs.retain(|(name, _)| name != "LOG_LEVEL");

    // Disable progress bar
    envs.push(("HF_HUB_DISABLE_PROGRESS_BARS".into(), "1".into()));

    // If huggingface_hub_cache is set, pass it to the download process
    // Useful when running inside a docker container
    if let Some(ref huggingface_hub_cache) = huggingface_hub_cache {
        envs.push(("HUGGINGFACE_HUB_CACHE".into(), huggingface_hub_cache.into()));
    };

    // Enable hf transfer for insane download speeds
    let enable_hf_transfer = env::var("HF_HUB_ENABLE_HF_TRANSFER").unwrap_or("1".to_string());
    envs.push((
        "HF_HUB_ENABLE_HF_TRANSFER".into(),
        enable_hf_transfer.into(),
    ));

    // Parse Inference API token
    if let Ok(api_token) = env::var("HF_API_TOKEN") {
        envs.push(("HF_TOKEN".into(), api_token.into()))
    };

    // If args.weights_cache_override is some, pass it to the download process
    // Useful when running inside a HuggingFace Inference Endpoint
    if let Some(weights_cache_override) = &weights_cache_override {
        envs.push((
            "WEIGHTS_CACHE_OVERRIDE".into(),
            weights_cache_override.into(),
        ));
    };

    // Start process
    tracing::info!("Starting check and download process for {model_id}");
    let mut download_process = match Command::new("text-generation-server")
        .args(download_args)
        .env_clear()
        .envs(envs)
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .process_group(0)
        .spawn()
    {
        Ok(p) => p,
        Err(err) => {
            if err.kind() == io::ErrorKind::NotFound {
                tracing::error!("text-generation-server not found in PATH");
                tracing::error!("Please install it with `make install-server`")
            } else {
                tracing::error!("{}", err);
            }

            return Err(LauncherError::DownloadError);
        }
    };

    let download_stdout = BufReader::new(download_process.stdout.take().unwrap());

    thread::spawn(move || {
        log_lines(download_stdout);
    });

    let download_stderr = BufReader::new(download_process.stderr.take().unwrap());

    // We read stderr in another thread as it seems that lines() can block in some cases
    let (err_sender, err_receiver) = mpsc::channel();
    thread::spawn(move || {
        for line in download_stderr.lines().map_while(Result::ok) {
            err_sender.send(line).unwrap_or(());
        }
    });

    loop {
        if let Some(status) = download_process.try_wait().unwrap() {
            if status.success() {
                tracing::info!("Successfully downloaded weights for {model_id}");
                break;
            }

            let mut err = String::new();
            while let Ok(line) = err_receiver.recv_timeout(Duration::from_millis(10)) {
                err = err + "\n" + &line;
            }

            if let Some(signal) = status.signal() {
                tracing::error!(
                    "Download process was signaled to shutdown with signal {signal}: {err}"
                );
            } else {
                tracing::error!("Download encountered an error: {err}");
            }

            return Err(LauncherError::DownloadError);
        }
        if !running.load(Ordering::SeqCst) {
            terminate("download", download_process, Duration::from_secs(10)).unwrap();
            return Ok(());
        }
        sleep(Duration::from_millis(100));
    }
    Ok(())
}

#[allow(clippy::too_many_arguments)]
fn spawn_shards(
    num_shard: usize,
    args: &Args,
    cuda_graphs: Vec<usize>,
    max_total_tokens: Option<usize>,
    max_input_tokens: Option<usize>,
    quantize: Option<Quantization>,
    max_log_level: LevelFilter,
    shutdown: Arc<AtomicBool>,
    shutdown_receiver: &mpsc::Receiver<()>,
    shutdown_sender: mpsc::Sender<()>,
    status_receiver: &mpsc::Receiver<ShardStatus>,
    status_sender: mpsc::Sender<ShardStatus>,
    running: Arc<AtomicBool>,
    graceful_termination_timeout: u64,
) -> Result<(), LauncherError> {
    // Start shard processes
    for rank in 0..num_shard {
        let model_id = args.model_id.clone();
        let revision = args.revision.clone();
        let uds_path = args.shard_uds_path.clone();
        let master_addr = args.master_addr.clone();
        let huggingface_hub_cache = args.huggingface_hub_cache.clone();
        let weights_cache_override = args.weights_cache_override.clone();
        let status_sender = status_sender.clone();
        let shutdown = shutdown.clone();
        let shutdown_sender = shutdown_sender.clone();
        let otlp_endpoint = args.otlp_endpoint.clone();
        let otlp_service_name = args.otlp_service_name.clone();
        let speculate = args.speculate;
        let dtype = args.dtype;
        let kv_cache_dtype = args.kv_cache_dtype;
        let trust_remote_code = args.trust_remote_code;
        let master_port = args.master_port;
        let disable_custom_kernels = args.disable_custom_kernels;
        let watermark_gamma = args.watermark_gamma;
        let watermark_delta = args.watermark_delta;
        let cuda_graphs_clone = cuda_graphs.clone();
        let cuda_memory_fraction = args.cuda_memory_fraction;
        let rope_scaling = args.rope_scaling;
        let rope_factor = args.rope_factor;
        let max_batch_size = args.max_batch_size;
        let lora_adapters = args.lora_adapters.clone();
        let enable_prefill_logprobs = args.enable_prefill_logprobs;
        thread::spawn(move || {
            shard_manager(
                model_id,
                revision,
                quantize,
                speculate,
                dtype,
                kv_cache_dtype,
                trust_remote_code,
                uds_path,
                rank,
                num_shard,
                master_addr,
                master_port,
                huggingface_hub_cache,
                weights_cache_override,
                disable_custom_kernels,
                watermark_gamma,
                watermark_delta,
                cuda_graphs_clone,
                cuda_memory_fraction,
                rope_scaling,
                rope_factor,
                max_total_tokens,
                max_batch_size,
                max_input_tokens,
                lora_adapters,
                enable_prefill_logprobs,
                otlp_endpoint,
                otlp_service_name,
                max_log_level,
                status_sender,
                shutdown,
                graceful_termination_timeout,
                shutdown_sender,
            )
        });
    }
    drop(shutdown_sender);

    // Wait for shard to start
    let mut shard_ready = 0;
    while running.load(Ordering::SeqCst) {
        match status_receiver.try_recv() {
            Ok(ShardStatus::Ready) => {
                shard_ready += 1;
                if shard_ready == num_shard {
                    break;
                }
            }
            Err(TryRecvError::Empty) => {
                sleep(Duration::from_millis(100));
            }
            Ok(ShardStatus::Failed(rank)) => {
                tracing::error!("Shard {rank} failed to start");
                shutdown_shards(shutdown, shutdown_receiver);
                return Err(LauncherError::ShardCannotStart);
            }
            Err(TryRecvError::Disconnected) => {
                tracing::error!("Shard status channel disconnected");
                shutdown_shards(shutdown, shutdown_receiver);
                return Err(LauncherError::ShardDisconnected);
            }
        }
    }
    Ok(())
}

#[derive(Debug)]
enum Gpu {
    RTX4090,
    T4,
    L4,
    L40,
    L40S,
    A10G,
    A40,
    H100,
    A100,
    H200,
    Unknown(String),
}

#[derive(Debug)]
struct ComputeType {
    count: usize,
    card: Gpu,
}

impl From<&str> for Gpu {
    fn from(value: &str) -> Self {
        match value {
            "nvidia-4090" => Gpu::RTX4090,
            "nvidia-t4" => Gpu::T4,
            "nvidia-l4" => Gpu::L4,
            "nvidia-l40" => Gpu::L40,
            "nvidia-l40s" => Gpu::L40S,
            "nvidia-a10g" => Gpu::A10G,
            "nvidia-a40" => Gpu::A40,
            "nvidia-h100-80gb-hbm3" => Gpu::H100,
            "nvidia-h100-nvl" => Gpu::H100,
            "nvidia-h100" => Gpu::H100,
            "nvidia-a100-sxm4-80gb" => Gpu::A100,
            "nvidia-a100-sxm4-40gb" => Gpu::A100,
            "nvidia-a100-80gb-pcie" => Gpu::A100,
            "nvidia-a100" => Gpu::A100,
            "nvidia-h200" => Gpu::H200,
            card => Gpu::Unknown(card.to_string()),
        }
    }
}

impl std::fmt::Display for Gpu {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Gpu::RTX4090 => write!(f, "nvidia-4090"),
            Gpu::T4 => write!(f, "nvidia-t4"),
            Gpu::L4 => write!(f, "nvidia-l4"),
            Gpu::L40 => write!(f, "nvidia-l40"),
            Gpu::L40S => write!(f, "nvidia-l40s"),
            Gpu::A10G => write!(f, "nvidia-a10g"),
            Gpu::A40 => write!(f, "nvidia-a40"),
            Gpu::H100 => write!(f, "nvidia-h100-80fb-hbm3"),
            Gpu::A100 => write!(f, "nvidia-a100-sxm4-80gb"),
            Gpu::H200 => write!(f, "nvidia-h200"),
            Gpu::Unknown(card) => write!(f, "{}", card),
        }
    }
}

impl ComputeType {
    fn f16_flop(&self) -> Option<u64> {
        let card_flop = match &self.card {
            // https://www.nvidia.com/en-us/geforce/graphics-cards/40-series/rtx-4090/
            // Specs are unclear https://www.itcreations.com/nvidia-gpu/nvidia-geforce-rtx-4090-gpu
            Gpu::RTX4090 => Some(82 * 10u64.pow(12)),
            // https://www.nvidia.com/en-us/data-center/tesla-t4/
            Gpu::T4 => Some(65 * 10u64.pow(12)),
            // https://www.nvidia.com/en-us/data-center/l4/
            Gpu::L4 => Some(121 * 10u64.pow(12)),
            // https://www.nvidia.com/en-us/data-center/l40/
            Gpu::L40 => Some(181 * 10u64.pow(12)),
            // https://www.nvidia.com/en-us/data-center/l40s/
            Gpu::L40S => Some(363 * 10u64.pow(12)),
            // https://www.nvidia.com/en-us/data-center/products/a10-gpu/
            Gpu::A10G => Some(125 * 10u64.pow(12)),
            // https://www.nvidia.com/en-us/data-center/a40/
            // https://images.nvidia.com/content/Solutions/data-center/a40/nvidia-a40-datasheet.pdf
            Gpu::A40 => Some(149 * 10u64.pow(12)),
            // https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
            Gpu::A100 => Some(312 * 10u64.pow(12)),
            // https://www.nvidia.com/en-us/data-center/h100/
            // https://www.techpowerup.com/gpu-specs/docs/nvidia-gh100-architecture.pdf
            Gpu::H100 => Some(900 * 10u64.pow(12)),
            // https://www.nvidia.com/en-us/data-center/h200/
            Gpu::H200 => Some(989 * 10u64.pow(12)),
            Gpu::Unknown(card) => {
                tracing::warn!("Unkown compute for card {card}");
                None
            }
        };
        card_flop.map(|f| f * self.count as u64)
    }

    fn vram(&self, memory_fraction: f32) -> Option<usize> {
        let output = Command::new("nvidia-smi")
            .args(["--query-gpu=memory.total", "--format=csv"])
            .output()
            .ok()?;
        let output = String::from_utf8(output.stdout).ok()?;
        let fullname = output.split('\n').nth(1)?;
        let mut tokens = fullname.split(' ');
        let amount = tokens.next()?;
        let unit = tokens.next()?;
        if unit != "MiB" {
            tracing::warn!("Unexpected memory unit {unit}, expected MiB");
            return None;
        }
        let amount: usize = amount.parse().ok()?;
        let amount = amount * 2usize.pow(20);
        let wiggle_room: f32 = env::var("TGI_WIGGLE_ROOM")
            .ok()
            .and_then(|wiggle| wiggle.parse().ok())
            .unwrap_or(0.95);
        let total = amount * self.count;
        let adjusted = ((total as f32) * memory_fraction * wiggle_room) as usize;
        Some(adjusted)
    }
}

impl From<ComputeType> for OsString {
    fn from(value: ComputeType) -> Self {
        format!("{}-{}", value.count, value.card).into()
    }
}

fn compute_type(count: usize) -> Option<ComputeType> {
    let output = Command::new("nvidia-smi")
        .args(["--query-gpu=gpu_name", "--format=csv"])
        .output()
        .ok()?;
    let output = String::from_utf8(output.stdout).ok()?;
    let fullname = output.split('\n').nth(1)?;
    let cardname = fullname.replace(' ', "-").to_lowercase();
    let card = (&*cardname).into();
    Some(ComputeType { count, card })
}

fn spawn_webserver(
    num_shard: usize,
    args: Args,
    max_input_tokens: Option<usize>,
    max_total_tokens: Option<usize>,
    max_batch_prefill_tokens: u32,
    shutdown: Arc<AtomicBool>,
    shutdown_receiver: &mpsc::Receiver<()>,
) -> Result<Child, LauncherError> {
    // All shard started
    // Start webserver
    tracing::info!("Starting Webserver");
    let mut router_args = vec![
        "--max-client-batch-size".to_string(),
        args.max_client_batch_size.to_string(),
        "--max-concurrent-requests".to_string(),
        args.max_concurrent_requests.to_string(),
        "--max-best-of".to_string(),
        args.max_best_of.to_string(),
        "--max-stop-sequences".to_string(),
        args.max_stop_sequences.to_string(),
        "--max-top-n-tokens".to_string(),
        args.max_top_n_tokens.to_string(),
        "--max-batch-prefill-tokens".to_string(),
        max_batch_prefill_tokens.to_string(),
        "--waiting-served-ratio".to_string(),
        args.waiting_served_ratio.to_string(),
        "--max-waiting-tokens".to_string(),
        args.max_waiting_tokens.to_string(),
        "--validation-workers".to_string(),
        args.validation_workers.to_string(),
        "--hostname".to_string(),
        args.hostname.to_string(),
        "--port".to_string(),
        args.port.to_string(),
        "--prometheus-port".to_string(),
        args.prometheus_port.to_string(),
        "--master-shard-uds-path".to_string(),
        format!("{}-0", args.shard_uds_path),
        "--tokenizer-name".to_string(),
        args.model_id,
        "--payload-limit".to_string(),
        args.payload_limit.to_string(),
    ];
    if let Some(max_input_tokens) = max_input_tokens {
        router_args.extend_from_slice(&[
            "--max-input-tokens".to_string(),
            max_input_tokens.to_string(),
        ]);
    }
    if let Some(max_total_tokens) = max_total_tokens {
        router_args.extend_from_slice(&[
            "--max-total-tokens".to_string(),
            max_total_tokens.to_string(),
        ]);
    }

    // Pass usage stats flags to router
    router_args.push("--usage-stats".to_string());
    router_args.push(args.usage_stats.to_string());

    // Grammar support
    if args.disable_grammar_support {
        router_args.push("--disable-grammar-support".to_string());
    }

    // Tokenizer config path
    if let Some(ref tokenizer_config_path) = args.tokenizer_config_path {
        router_args.push("--tokenizer-config-path".to_string());
        router_args.push(tokenizer_config_path.to_string());
    }

    // Model optional max batch total tokens
    if let Some(max_batch_total_tokens) = args.max_batch_total_tokens {
        router_args.push("--max-batch-total-tokens".to_string());
        router_args.push(max_batch_total_tokens.to_string());
    }

    // Router optional max batch size
    if let Some(max_batch_size) = args.max_batch_size {
        router_args.push("--max-batch-size".to_string());
        router_args.push(max_batch_size.to_string());
    }

    // Model optional revision
    if let Some(ref revision) = args.revision {
        router_args.push("--revision".to_string());
        router_args.push(revision.to_string())
    }

    if args.trust_remote_code {
        router_args.push("--trust-remote-code".to_string());
    }

    if args.json_output {
        router_args.push("--json-output".to_string());
    }

    // OpenTelemetry
    if let Some(otlp_endpoint) = args.otlp_endpoint {
        router_args.push("--otlp-endpoint".to_string());
        router_args.push(otlp_endpoint);
    }

    // OpenTelemetry
    let otlp_service_name = args.otlp_service_name;
    router_args.push("--otlp-service-name".to_string());
    router_args.push(otlp_service_name);

    // CORS origins
    for origin in args.cors_allow_origin.into_iter() {
        router_args.push("--cors-allow-origin".to_string());
        router_args.push(origin);
    }

    // API Key
    if let Some(api_key) = args.api_key {
        router_args.push("--api-key".to_string());
        router_args.push(api_key);
    }
    // Ngrok
    if args.ngrok {
        router_args.push("--ngrok".to_string());
        router_args.push("--ngrok-authtoken".to_string());
        router_args.push(args.ngrok_authtoken.unwrap());
        router_args.push("--ngrok-edge".to_string());
        router_args.push(args.ngrok_edge.unwrap());
    }

    // Copy current process env
    let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();

    // Parse Inference API token
    if let Ok(api_token) = env::var("HF_API_TOKEN") {
        envs.push(("HF_TOKEN".into(), api_token.into()))
    };

    // Parse Compute type
    if let Ok(compute_type) = env::var("COMPUTE_TYPE") {
        envs.push(("COMPUTE_TYPE".into(), compute_type.into()))
    } else if let Some(compute_type) = compute_type(num_shard) {
        envs.push(("COMPUTE_TYPE".into(), compute_type.into()))
    }

    let mut webserver = match Command::new("text-generation-router")
        .args(router_args)
        .envs(envs)
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .process_group(0)
        .spawn()
    {
        Ok(p) => p,
        Err(err) => {
            tracing::error!("Failed to start webserver: {}", err);
            if err.kind() == io::ErrorKind::NotFound {
                tracing::error!("text-generation-router not found in PATH");
                tracing::error!("Please install it with `make install-router`")
            } else {
                tracing::error!("{}", err);
            }

            shutdown_shards(shutdown, shutdown_receiver);
            return Err(LauncherError::WebserverCannotStart);
        }
    };

    // Redirect STDOUT and STDERR to the console
    let webserver_stdout = webserver.stdout.take().unwrap();
    let webserver_stderr = webserver.stderr.take().unwrap();

    thread::spawn(move || {
        let stdout = BufReader::new(webserver_stdout);
        let stderr = BufReader::new(webserver_stderr);
        for line in stdout.lines() {
            println!("{}", line.unwrap());
        }
        for line in stderr.lines() {
            println!("{}", line.unwrap());
        }
    });
    Ok(webserver)
}

fn terminate(process_name: &str, mut process: Child, timeout: Duration) -> io::Result<ExitStatus> {
    tracing::info!("Terminating {process_name}");

    let terminate_time = Instant::now();
    signal::kill(Pid::from_raw(process.id() as i32), Signal::SIGTERM).unwrap();

    tracing::info!("Waiting for {process_name} to gracefully shutdown");
    while terminate_time.elapsed() < timeout {
        if let Some(status) = process.try_wait()? {
            tracing::info!("{process_name} terminated");
            return Ok(status);
        }
        sleep(Duration::from_millis(100));
    }
    tracing::info!("Killing {process_name}");

    process.kill()?;
    let exit_status = process.wait()?;

    tracing::info!("{process_name} killed");
    Ok(exit_status)
}

fn main() -> Result<(), LauncherError> {
    // Pattern match configuration
    let args: Args = Args::parse();

    let graceful_termination_timeout = args.graceful_termination_timeout;

    // Filter events with LOG_LEVEL
    let varname = "LOG_LEVEL";
    let env_filter = if let Ok(log_level) = std::env::var(varname) {
        // Override to avoid simple logs to be spammed with tokio level informations
        let log_level = match &log_level[..] {
            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
            "info" => "text_generation_launcher=info,text_generation_router=info",
            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
            log_level => log_level,
        };
        EnvFilter::builder()
            .with_default_directive(LevelFilter::INFO.into())
            .parse_lossy(log_level)
    } else {
        EnvFilter::new("info")
    };
    let max_log_level = env_filter.max_level_hint().unwrap_or(LevelFilter::INFO);

    if args.json_output {
        tracing_subscriber::fmt()
            .with_env_filter(env_filter)
            .json()
            .init();
    } else {
        tracing_subscriber::fmt()
            .with_env_filter(env_filter)
            .compact()
            .init();
    }

    if args.env {
        let env_runtime = env_runtime::Env::new();
        tracing::info!("{}", env_runtime);
    }

    tracing::info!("{:#?}", args);

    let config: Option<Config> = get_config(&args.model_id, &args.revision).ok();
    let quantize = config.as_ref().and_then(|c| c.quantize);
    // Quantization usually means you're even more RAM constrained.

    let (prefix_caching, attention) = resolve_attention(&config, &args.lora_adapters);
    tracing::info!("Using attention {attention} - Prefix caching {prefix_caching}");
    std::env::set_var("PREFIX_CACHING", prefix_caching);
    std::env::set_var("ATTENTION", attention);

    let num_shard = find_num_shards(args.sharded, args.num_shard)?;
    if num_shard > 1 {
        if matches!(args.quantize, Some(Quantization::Exl2)) {
            return Err(LauncherError::ArgumentValidation(
                "Sharding is currently not supported with `exl2` quantization".into(),
            ));
        }
        tracing::info!("Sharding model on {num_shard} processes");
    }

    let max_input_tokens = {
        match (args.max_input_tokens, args.max_input_length) {
            (Some(max_input_tokens), Some(max_input_length)) => {
                return Err(LauncherError::ArgumentValidation(
                    format!("Both `max_input_tokens` ({max_input_tokens}) and `max_input_length` ({max_input_length}) are set. Please define only `max_input_tokens` as `max_input_length is deprecated for naming consistency.",
                )));
            }
            (Some(max_input_tokens), None) | (None, Some(max_input_tokens)) => {
                Some(max_input_tokens)
            }
            (None, None) => None,
        }
    };
    let max_total_tokens = args.max_total_tokens;
    let max_batch_prefill_tokens = {
        match args.max_batch_prefill_tokens {
            Some(max_batch_prefill_tokens) => max_batch_prefill_tokens,
            None => {
                let compute_type = compute_type(num_shard);
                let compute_optimal = compute_optimal(config.as_ref(), compute_type.as_ref());
                // TODO: remove this when we correctly esimate the flops for VLMs
                // this is a short term temporary fix to enable vlms to avoid rejecting images
                let default_optimal = match config {
                    Some(ref config) => match config.model_type.as_deref() {
                        Some("qwen2_vl") | Some("qwen2_5_vl") => 10_000,
                        Some("gemma3") => 8000,
                        _ => 4096,
                    },
                    None => 4096,
                };
                let default = compute_optimal.unwrap_or(default_optimal);
                let vram_maximum = vram_maximum(
                    config.as_ref(),
                    compute_type.as_ref(),
                    args.cuda_memory_fraction,
                );
                let max_position_embeddings = config.and_then(|c| c.max_position_embeddings);
                let value = if let Some(max_position_embeddings) = max_position_embeddings {
                    default.min(max_position_embeddings)
                } else {
                    default
                };
                let value = if let Some(vram_maximum) = vram_maximum {
                    if vram_maximum < value {
                        tracing::warn!("Reducing the max batch prefill from {default} to {vram_maximum} because there is not enough VRAM to support it.");
                    }
                    value.min(vram_maximum)
                } else {
                    value
                };
                tracing::info!("Default `max_batch_prefill_tokens` to {value}");
                value as u32
            }
        }
    };

    // Validate args
    if let (Some(max_input_tokens), Some(max_total_tokens)) = (max_input_tokens, max_total_tokens) {
        if max_input_tokens >= max_total_tokens {
            return Err(LauncherError::ArgumentValidation(
                    format!("`max_input_tokens`({max_input_tokens}) must be < `max_total_tokens`({max_total_tokens})"),
                ));
        }
    }

    if matches!(args.quantize, Some(Quantization::Bitsandbytes)) {
        tracing::warn!("Bitsandbytes is deprecated, use `eetq` instead, which provides better latencies overall and is drop-in in most cases.");
    }
    let quantize = args.quantize.or(quantize);
    let cuda_graphs = match (&args.cuda_graphs, &quantize) {
        (Some(cuda_graphs), _) => cuda_graphs.iter().cloned().filter(|&c| c > 0).collect(),
        #[allow(deprecated)]
        (None, Some(Quantization::Bitsandbytes)) => {
            tracing::warn!("Bitsandbytes doesn't work with cuda graphs, deactivating them");
            vec![]
        }
        (None, Some(Quantization::Exl2)) => {
            tracing::warn!("Exl2 doesn't work with cuda graphs, deactivating them");
            vec![]
        }
        _ => {
            let cuda_graphs = vec![1, 2, 4, 8, 16, 32];
            tracing::info!("Using default cuda graphs {cuda_graphs:?}");
            cuda_graphs
        }
    };

    if args.validation_workers == 0 {
        return Err(LauncherError::ArgumentValidation(
            "`validation_workers` must be > 0".to_string(),
        ));
    }
    if args.trust_remote_code {
        tracing::warn!(
            "`trust_remote_code` is set. Trusting that model `{}` do not contain malicious code.",
            args.model_id
        );
    }

    if let Some(ref max_batch_total_tokens) = args.max_batch_total_tokens {
        if let Some(max_total_tokens) = max_total_tokens {
            if max_total_tokens as u32 > *max_batch_total_tokens {
                return Err(LauncherError::ArgumentValidation(format!(
                    "`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {} and {}",
                    max_total_tokens, max_batch_total_tokens
                )));
            }
        }
    }

    if args.ngrok {
        if args.ngrok_authtoken.is_none() {
            return Err(LauncherError::ArgumentValidation(
                "`ngrok-authtoken` must be set when using ngrok tunneling".to_string(),
            ));
        }

        if args.ngrok_edge.is_none() {
            return Err(LauncherError::ArgumentValidation(
                "`ngrok-edge` must be set when using ngrok tunneling".to_string(),
            ));
        }
    }

    // Signal handler
    let running = Arc::new(AtomicBool::new(true));
    let r = running.clone();
    ctrlc::set_handler(move || {
        r.store(false, Ordering::SeqCst);
    })
    .expect("Error setting Ctrl-C handler");

    // Download and convert model weights
    download_convert_model(
        &args.model_id,
        args.revision.as_deref(),
        args.trust_remote_code,
        args.huggingface_hub_cache.as_deref(),
        args.weights_cache_override.as_deref(),
        running.clone(),
        true, // if its only a lora model - we should merge the lora adapters
    )?;

    // Download and convert lora adapters if any
    if let Some(lora_adapters) = &args.lora_adapters {
        for adapter in lora_adapters.split(',') {
            // skip download if a path is provided
            if adapter.contains('=') {
                continue;
            }

            let adapter = adapter.trim();

            // check if adapter has more than 1 '@'
            if adapter.matches('@').count() > 1 {
                return Err(LauncherError::ArgumentValidation(format!(
                    "Invalid LoRA adapter format: {}",
                    adapter
                )));
            }

            // capture adapter_id, path, revision in format of adapter_id=path@revision
            // path is disabled beforehand.
            let mut splits = adapter.split("@");
            let adapter_id = splits.next().ok_or_else(|| {
                LauncherError::ArgumentValidation("Missing adapter id".to_string())
            })?;
            let revision = splits.next();
            download_convert_model(
                adapter_id,
                revision,
                args.trust_remote_code,
                args.huggingface_hub_cache.as_deref(),
                args.weights_cache_override.as_deref(),
                running.clone(),
                false, // avoid merging lora adapters if using multi-lora
            )?;
        }
    }

    if !running.load(Ordering::SeqCst) {
        // Launcher was asked to stop
        return Ok(());
    }

    // Shared shutdown bool
    let shutdown = Arc::new(AtomicBool::new(false));
    // Shared shutdown channel
    // When shutting down, the main thread will wait for all senders to be dropped
    let (shutdown_sender, shutdown_receiver) = mpsc::channel();

    // Shared channel to track shard status
    let (status_sender, status_receiver) = mpsc::channel();

    spawn_shards(
        num_shard,
        &args,
        cuda_graphs,
        max_total_tokens,
        max_input_tokens,
        quantize,
        max_log_level,
        shutdown.clone(),
        &shutdown_receiver,
        shutdown_sender,
        &status_receiver,
        status_sender,
        running.clone(),
        graceful_termination_timeout,
    )?;

    // We might have received a termination signal
    if !running.load(Ordering::SeqCst) {
        shutdown_shards(shutdown, &shutdown_receiver);
        return Ok(());
    }

    let mut webserver = spawn_webserver(
        num_shard,
        args,
        max_input_tokens,
        max_total_tokens,
        max_batch_prefill_tokens,
        shutdown.clone(),
        &shutdown_receiver,
    )
    .inspect_err(|_| {
        shutdown_shards(shutdown.clone(), &shutdown_receiver);
    })?;

    // Default exit code
    let mut exit_code = Ok(());

    while running.load(Ordering::SeqCst) {
        if let Ok(ShardStatus::Failed(rank)) = status_receiver.try_recv() {
            tracing::error!("Shard {rank} crashed");
            exit_code = Err(LauncherError::ShardFailed);
            break;
        };

        match webserver.try_wait().unwrap() {
            Some(_) => {
                tracing::error!("Webserver Crashed");
                shutdown_shards(shutdown, &shutdown_receiver);
                return Err(LauncherError::WebserverFailed);
            }
            None => {
                sleep(Duration::from_millis(100));
            }
        };
    }

    // Graceful termination
    terminate(
        "webserver",
        webserver,
        Duration::from_secs(graceful_termination_timeout),
    )
    .unwrap();
    shutdown_shards(shutdown, &shutdown_receiver);

    exit_code
}


================================================
FILE: load_tests/Makefile
================================================

ShareGPT_V3_unfiltered_cleaned_split.json:
	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
	python filter.py

prepare_orca:
	python orca.py


================================================
FILE: load_tests/benchmarks.py
================================================
import argparse
import datetime
import json
import os
import traceback
from typing import Dict, Tuple, List

import GPUtil
import docker
from docker.models.containers import Container
from loguru import logger
import pandas as pd


class InferenceEngineRunner:
    def __init__(self, model: str):
        self.model = model

    def run(self, parameters: list[tuple], gpus: int = 0):
        NotImplementedError("This method should be implemented by the subclass")

    def stop(self):
        NotImplementedError("This method should be implemented by the subclass")


class TGIDockerRunner(InferenceEngineRunner):
    def __init__(
        self,
        model: str,
        image: str = "ghcr.io/huggingface/text-generation-inference:latest",
        volumes=None,
    ):
        super().__init__(model)
        if volumes is None:
            volumes = []
        self.container = None
        self.image = image
        self.volumes = volumes

    def run(self, parameters: list[tuple], gpus: int = 0):
        params = f"--model-id {self.model} --port 8080"
        for p in parameters:
            params += f" --{p[0]} {str(p[1])}"
        logger.info(f"Running TGI with parameters: {params}")
        volumes = {}
        for v in self.volumes:
            volumes[v[0]] = {"bind": v[1], "mode": "rw"}
        self.container = run_docker(
            self.image,
            params,
            "Connected",
            "ERROR",
            volumes=volumes,
            gpus=gpus,
            ports={"8080/tcp": 8080},
        )

    def stop(self):
        if self.container:
            self.container.stop()


class BenchmarkRunner:
    def __init__(
        self,
        image: str = "ghcr.io/huggingface/text-generation-inference-benchmark:latest",
        volumes: List[Tuple[str, str]] = None,
    ):
        if volumes is None:
            volumes = []
        self.container = None
        self.image = image
        self.volumes = volumes

    def run(self, parameters: list[tuple], network_mode):
        params = "text-generation-inference-benchmark"
        for p in parameters:
            params += f" --{p[0]} {str(p[1])}" if p[1] is not None else f" --{p[0]}"
        logger.info(
            f"Running text-generation-inference-benchmarks with parameters: {params}"
        )
        volumes = {}
        for v in self.volumes:
            volumes[v[0]] = {"bind": v[1], "mode": "rw"}
        self.container = run_docker(
            self.image,
            params,
            "Benchmark finished",
            "Fatal:",
            volumes=volumes,
            extra_env={
                "RUST_LOG": "text_generation_inference_benchmark=info",
                "RUST_BACKTRACE": "full",
            },
            network_mode=network_mode,
        )

    def stop(self):
        if self.container:
            self.container.stop()


def run_docker(
    image: str,
    args: str,
    success_sentinel: str,
    error_sentinel: str,
    ports: Dict[str, int] = None,
    volumes=None,
    network_mode: str = "bridge",
    gpus: int = 0,
    extra_env: Dict[str, str] = None,
) -> Container:
    if ports is None:
        ports = {}
    if volumes is None:
        volumes = {}
    if extra_env is None:
        extra_env = {}
    client = docker.from_env(timeout=300)
    # retrieve the GPU devices from CUDA_VISIBLE_DEVICES
    devices = [f"{i}" for i in range(get_num_gpus())][:gpus]
    environment = {"HF_TOKEN": os.environ.get("HF_TOKEN")}
    environment.update(extra_env)
    container = client.containers.run(
        image,
        args,
        detach=True,
        device_requests=(
            [docker.types.DeviceRequest(device_ids=devices, capabilities=[["gpu"]])]
            if gpus > 0
            else None
        ),
        volumes=volumes,
        shm_size="1g",
        ports=ports,
        network_mode=network_mode,
        environment=environment,
    )
    for line in container.logs(stream=True):
        print(line.decode("utf-8"), end="")
        if success_sentinel.encode("utf-8") in line:
            break
        if error_sentinel.encode("utf-8") in line:
            container.stop()
            raise Exception(f"Error starting container: {line}")
    return container


def get_gpu_names() -> str:
    gpus = GPUtil.getGPUs()
    if len(gpus) == 0:
        return ""
    return f'{len(gpus)}x{gpus[0].name if gpus else "No GPU available"}'


def get_gpu_name() -> str:
    gpus = GPUtil.getGPUs()
    if len(gpus) == 0:
        return ""
    return gpus[0].name


def get_num_gpus() -> int:
    return len(GPUtil.getGPUs())


def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:
    df = pd.DataFrame()
    now = datetime.datetime.now(datetime.timezone.utc)
    created_at = now.isoformat()  # '2024-10-02T11:53:17.026215+00:00'
    # Load the results
    for key, filename in data_files.items():
        with open(filename, "r") as f:
            data = json.load(f)
            for result in data["results"]:
                entry = result
                [config] = pd.json_normalize(result["config"]).to_dict(orient="records")
                entry.update(config)
                entry["engine"] = data["config"]["meta"]["engine"]
                entry["tp"] = data["config"]["meta"]["tp"]
                entry["version"] = data["config"]["meta"]["version"]
                entry["model"] = model
                entry["created_at"] = created_at
                del entry["config"]
                df = pd.concat([df, pd.DataFrame(entry, index=[0])])
    return df


def main(sha, results_file):
    results_dir = "results"
    # get absolute path
    results_dir = os.path.join(os.path.dirname(__file__), results_dir)
    logger.info("Starting benchmark")
    models = [
        ("meta-llama/Llama-3.1-8B-Instruct", 1),
        # ('meta-llama/Llama-3.1-70B-Instruct', 4),
        # ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
    ]
    success = True
    for model in models:
        tgi_runner = TGIDockerRunner(model[0])
        # create results directory
        model_dir = os.path.join(
            results_dir, f'{model[0].replace("/", "_").replace(".", "_")}'
        )
        os.makedirs(model_dir, exist_ok=True)
        runner = BenchmarkRunner(
            volumes=[(model_dir, "/opt/text-generation-inference-benchmark/results")]
        )
        try:
            tgi_runner.run([("max-concurrent-requests", 512)], gpus=model[1])
            logger.info(f"TGI started for model {model[0]}")
            parameters = [
                ("tokenizer-name", model[0]),
                ("max-vus", 800),
                ("url", "http://localhost:8080"),
                ("duration", "120s"),
                ("warmup", "30s"),
                ("benchmark-kind", "rate"),
                (
                    "prompt-options",
                    "num_tokens=200,max_tokens=220,min_tokens=180,variance=10",
                ),
                (
                    "decode-options",
                    "num_tokens=200,max_tokens=220,min_tokens=180,variance=10",
                ),
                (
                    "extra-meta",
                    f'"engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}"',
                ),
                ("no-console", None),
            ]
            rates = [("rates", f"{r / 10.}") for r in list(range(8, 248, 8))]
            parameters.extend(rates)
            runner.run(parameters, f"container:{tgi_runner.container.id}")
        except Exception as e:
            logger.error(f"Error running benchmark for model {model[0]}: {e}")
            # print the stack trace
            print(traceback.format_exc())
            success = False
        finally:
            tgi_runner.stop()
            runner.stop()
    if not success:
        logger.error("Some benchmarks failed")
        exit(1)

    df = pd.DataFrame()
    # list recursively directories
    directories = [
        f"{results_dir}/{d}"
        for d in os.listdir(results_dir)
        if os.path.isdir(f"{results_dir}/{d}")
    ]
    logger.info(f"Found result directories: {directories}")
    for directory in directories:
        data_files = {}
        for filename in os.listdir(directory):
            if filename.endswith(".json"):
                data_files[filename.split(".")[-2]] = f"{directory}/{filename}"
        logger.info(f"Processing directory {directory}")
        df = pd.concat([df, build_df(directory.split("/")[-1], data_files)])
    df["device"] = get_gpu_name()
    df["error_rate"] = (
        df["failed_requests"]
        / (df["failed_requests"] + df["successful_requests"])
        * 100.0
    )
    df.to_parquet(results_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--sha", help="SHA of the commit to add to the results", required=True
    )
    parser.add_argument(
        "--results-file",
        help="The file where to store the results, can be a local file or a s3 path",
    )
    args = parser.parse_args()
    if args.results_file is None:
        results_file = f"{args.sha}.parquet"
    else:
        results_file = args.results_file

    main(args.sha, results_file)


================================================
FILE: load_tests/common.js
================================================
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';

const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;
const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
const max_new_tokens = 50;

// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("small.json"))


export function get_options() {
    return {
        thresholds: {
            http_req_failed: ['rate==0'],
            // time_per_token: [{
            //     threshold: `p(50)<${5 * reference_latency_ms}`,
            //     abortOnFail: true,
            //     delayAbortEval: '10s'
            // }],
        },
        scenarios: {
            // single_user: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 1,
            //     rate: 20,
            //     timeUnit: '1s',
            // },
            // load_test: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 100,
            //     rate: 1,
            //     timeUnit: '1s',
            // },
            // breakpoint: {
            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
            //     preAllocatedVUs: 300,
            //     stages: [
            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
            //     ],
            // },
            throughput: {
                executor: 'shared-iterations',
                vus: 100,
                iterations: 200,
                maxDuration: '40s',
            },
        },
    };
}

function generate_payload(gpt, max_new_tokens) {
    const input = gpt["conversations"][0]["value"];
    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
}

export const options = get_options();

export default function run() {
    const headers = { 'Content-Type': 'application/json' };
    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
        headers,
    });
    if (res.status >= 400 && res.status < 500) {
        return;
    }


    check(res, {
        'Post status is 200': (res) => res.status === 200,
    });
    const duration = res.timings.duration;

    if (res.status === 200) {
        const body = res.json();
        const completion_tokens = body.usage.completion_tokens;
        const latency_ms_per_token = duration / completion_tokens;
        timePerToken.add(latency_ms_per_token);
        const prompt_tokens = body.usage.prompt_tokens;
        input_tokens.add(prompt_tokens);
        new_tokens.add(completion_tokens);
        tokens.add(completion_tokens + prompt_tokens);
    }
}


================================================
FILE: load_tests/filter.py
================================================
import json


def main():
    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
        data = json.load(f)

    # Select only the first 2k conversations that start with a human.
    max = 2000
    conversations = []
    for conversation in data:
        conv = conversation.get("conversations")
        if conv and conv[0]["from"] == "human":
            # Trim the rest of the output
            conversation["conversations"] = conversation["conversations"][:1]
            conversations.append(conversation)

            if len(conversation) >= max:
                break

    with open("./small.json", "w") as f:
        data = json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()


================================================
FILE: load_tests/long.js
================================================
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';

const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;
const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
const max_new_tokens = 50;

// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("long.json"))


export function get_options() {
    return {
        thresholds: {
            http_req_failed: ['rate==0'],
            // time_per_token: [{
            //     threshold: `p(50)<${5 * reference_latency_ms}`,
            //     abortOnFail: true,
            //     delayAbortEval: '10s'
            // }],
        },
        scenarios: {
            // single_user: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 1,
            //     rate: 20,
            //     timeUnit: '1s',
            // },
            // load_test: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 100,
            //     rate: 1,
            //     timeUnit: '1s',
            // },
            // breakpoint: {
            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
            //     preAllocatedVUs: 300,
            //     stages: [
            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
            //     ],
            // },
            throughput: {
                executor: 'shared-iterations',
                vus: 10,
                iterations: 10,
                maxDuration: '120s',
            },
        },
    };
}

function generate_payload(gpt, max_new_tokens) {
    const input = gpt["conversations"][0]["value"];
    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
}

export const options = get_options();

export default function run() {
    const headers = { 'Content-Type': 'application/json' };
    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
        headers,
    });
    if (res.status >= 400 && res.status < 500) {
        return;
    }


    check(res, {
        'Post status is 200': (res) => res.status === 200,
    });
    const duration = res.timings.duration;

    if (res.status === 200) {
        const body = res.json();
        const completion_tokens = body.usage.completion_tokens;
        const latency_ms_per_token = duration / completion_tokens;
        timePerToken.add(latency_ms_per_token);
        const prompt_tokens = body.usage.prompt_tokens;
        input_tokens.add(prompt_tokens);
        new_tokens.add(completion_tokens);
        tokens.add(completion_tokens + prompt_tokens);
    }
}


================================================
FILE: load_tests/long.py
================================================
import datasets
import json


dataset = datasets.load_dataset("ccdv/govreport-summarization")
max_new_tokens = 50


conversations = []

for i, item in enumerate(dataset["test"]):
    report = item["report"]

    messages = [{"from": "human", "value": f"Summarize this report: ```{report}```"}]

    conversations.append({"conversations": messages})

with open("long.json", "w") as f:
    json.dump(conversations, f, indent=4)


================================================
FILE: load_tests/long_prompt2.py
================================================
# https://www.gutenberg.org/cache/epub/103/pg103.txt
from openai import OpenAI
import os
import requests

if not os.path.exists("pg103.txt"):
    response = requests.get("https://www.gutenberg.org/cache/epub/103/pg103.txt")
    with open("pg103.txt", "w") as f:
        f.write(response.text)


length = 130000
with open("pg103.txt", "r") as f:
    data = f.read()

messages = [{"role": "user", "content": data[: length * 4]}]

client = OpenAI(base_url="http://localhost:8000/v1", api_key="w")

completion = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct", messages=messages, max_tokens=2
)


================================================
FILE: load_tests/orca.py
================================================
import json
import datasets
import tqdm


def main():
    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
    # Select only the first 2k conversations that start with a human.
    max = min(2000, len(dataset))
    conversations = []
    for item in tqdm.tqdm(dataset, total=max):
        conversation = {
            "conversations": [
                {"from": "human", "value": item["question"]},
            ],
            "id": item["id"],
        }
        conversations.append(conversation)
        if len(conversations) >= max:
            break

    with open("./small.json", "w") as f:
        json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()


================================================
FILE: load_tests/pyproject.toml
================================================
[tool.poetry]
name = "text-generation-inference-benchmarks"
version = "0.1.0"
description = ""
authors = ["Hugo Larcher <hugo.larcher@huggingface.co>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
docker = "^7.1.0"
loguru = "^0.7.2"
psutil = "^6.0.0"
gputil = "^1.4.0"
pandas = "^2.2.3"
pyarrow = "^17.0.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"


================================================
FILE: nix/client.nix
================================================
{
  buildPythonPackage,
  poetry-core,
  aiohttp,
  huggingface-hub,
  pydantic,
}:

buildPythonPackage {
  name = "text-generation";

  src = ../clients/python;

  pyproject = true;

  build-system = [ poetry-core ];

  dependencies = [
    aiohttp
    huggingface-hub
    pydantic
  ];
}


================================================
FILE: nix/crate-overrides.nix
================================================
{ pkgs, nix-filter }:

let
  filter = nix-filter.lib;
in
with pkgs;
defaultCrateOverrides
// {
  aws-lc-rs = attrs: {
    # aws-lc-rs does its own custom parsing of Cargo environment
    # variables like DEP_.*_INCLUDE. However buildRustCrate does
    # not use the version number, so the parsing fails.
    postPatch = ''
      substituteInPlace build.rs \
        --replace-fail \
        "assert!(!selected.is_empty()" \
        "// assert!(!selected.is_empty()"
    '';
  };
  rav1e = attrs: { env.CARGO_ENCODED_RUSTFLAGS = "-C target-feature=-crt-static"; };

  grpc-metadata = attrs: {
    src = filter {
      root = ../backends/grpc-metadata;
      include = with filter; [
        isDirectory
        (matchExt "rs")
      ];
    };
  };
  pyo3-build-config = attrs: {
    buildInputs = [ python3 ];
  };
  text-generation-benchmark = attrs: {
    src = filter {
      root = ../benchmark;
      include = with filter; [
        isDirectory
        (matchExt "rs")
      ];
    };
  };
  text-generation-client = attrs: {
    src = filter {
      root = ../.;
      include = with filter; [
        isDirectory
        (and (inDirectory "backends/client") (matchExt "rs"))
        (and (inDirectory "proto") (matchExt "proto"))
      ];
    };
    postPatch = "cd backends/client";
    buildInputs = [ protobuf ];
  };
  text-generation-launcher = attrs: {
    src = filter {
      root = ../launcher;
      include = with filter; [
        isDirectory
        (matchExt "rs")
      ];
    };
  };
  text-generation-router = attrs: {
    src = filter {
      root = ../router;
      include = with filter; [
        isDirectory
        (matchExt "rs")
      ];
    };
  };
  text-generation-router-v3 = attrs: {
    # We need to do the src/source root dance so that the build
    # has access to the protobuf file.
    src = filter {
      root = ../.;
      include = with filter; [
        isDirectory
        (and (inDirectory "backends/v3") (matchExt "rs"))
        (and (inDirectory "proto") (matchExt "proto"))
      ];
    };
    postPatch = "cd backends/v3";
    buildInputs = [ protobuf ];
  };
}


================================================
FILE: nix/docker.nix
================================================
{
  stdenv,
  dockerTools,
  cacert,
  text-generation-inference,
  stream ? false,
}:

let
  build = if stream then dockerTools.streamLayeredImage else dockerTools.buildLayeredImage;
in
build {
  name = "tgi-docker";
  tag = "latest";
  compressor = "zstd";
  config = {
    EntryPoint = [ "${text-generation-inference}/bin/text-generation-inference" ];
    Env = [
      "HF_HOME=/data"
      "PORT=80"
      # The CUDA container toolkit will mount the driver shim into the
      # container. We just have to ensure that the dynamic loader finds
      # the libraries.
      "LD_LIBRARY_PATH=/usr/lib64"
    ];

  };
  extraCommands = ''
    mkdir -p tmp
    chmod -R 1777 tmp
  '';
  contents = [
    cacert
    stdenv.cc
  ];
}


================================================
FILE: nix/impure-shell.nix
================================================
{
  lib,
  mkShell,
  black,
  cmake,
  isort,
  ninja,
  which,
  cudaPackages,
  openssl,
  pkg-config,
  poetry,
  protobuf,
  python3,
  pyright,
  redocly,
  ruff,
  rust-bin,
  server,

  # Enable dependencies for building CUDA packages. Useful for e.g.
  # developing marlin/moe-kernels in-place.
  withCuda ? false,
}:

mkShell {
  nativeBuildInputs =
    [
      black
      isort
      pkg-config
      poetry
      (rust-bin.stable.latest.default.override {
        extensions = [
          "rust-analyzer"
          "rust-src"
        ];
      })
      protobuf
      pyright
      redocly
      ruff
    ]
    ++ (lib.optionals withCuda [
      cmake
      ninja
      which

      # For most Torch-based extensions, setting CUDA_HOME is enough, but
      # some custom CMake builds (e.g. vLLM) also need to have nvcc in PATH.
      cudaPackages.cuda_nvcc
    ]);
  buildInputs =
    [
      openssl.dev
    ]
    ++ (with python3.pkgs; [
      venvShellHook
      docker
      pip
      ipdb
      click
      openai
      pytest
      pytest-asyncio
      syrupy
    ])
    ++ (lib.optionals withCuda (
      with cudaPackages;
      [
        cuda_cccl
        cuda_cudart
        cuda_nvrtc
        cuda_nvtx
        cuda_profiler_api
        cudnn
        libcublas
        libcusolver
        libcusparse
      ]
    ));

  inputsFrom = [ server ];

  env = lib.optionalAttrs withCuda {
    CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" python3.pkgs.torch.cudaCapabilities;
  };

  venvDir = "./.venv";

  postVenvCreation = ''
    unset SOURCE_DATE_EPOCH
    ( cd server ; python -m pip install --no-build-isolation --no-dependencies -e . )
    ( cd clients/python ; python -m pip install --no-dependencies -e . )
  '';

  postShellHook =
    ''
      unset SOURCE_DATE_EPOCH
      export PATH=${cudaPackages.backendStdenv.cc}/bin:$PATH:~/.cargo/bin
    ''
    # At various points in time, the latest gcc supported by CUDA differs
    # from the default version in nixpkgs. A lot of the dependencies in
    # the impure environment pull in the default gcc from nixpkgs, so we
    # end up with the CUDA-supported gcc and the nixpkgs default gcc in
    # the path. To ensure that we can build CUDA kernels, put the CUDA
    # first in the path. It's a hack, but it works.
    + lib.optionalString withCuda ''
      export PATH=${cudaPackages.backendStdenv.cc}/bin:$PATH
    '';
}


================================================
FILE: nix/overlay.nix
================================================
final: prev: {
  # You can use this overlay to temporarily override packages for
  # development. For permanent overrides, it's better to do this in
  # our package flake:
  #
  # https://github.com/huggingface/text-generation-inference-nix
  #
  # Note that overriding packages that are in the transitive closure
  # of many other packages (e.g. transformers) will require a large
  # rebuild.

  pythonPackagesExtensions = prev.pythonPackagesExtensions ++ [
    (
      python-self: python-super: with python-self; {
        # Python package override example:
        #transformers = python-super.transformers.overrideAttrs (
        #  _: _: {
        #    src = final.fetchFromGitHub {
        #      owner = "huggingface";
        #      repo = "transformers";
        #      rev = "v4.51.0";
        #      hash = "sha256-dnVpc6fm1SYGcx7FegpwVVxUY6XRlsxLs5WOxYv11y8=";
        #    };
        #  }
        #);
        #huggingface-hub = python-super.huggingface-hub.overrideAttrs (
        #  _: _: {
        #    src = final.fetchFromGitHub {
        #      owner = "huggingface";
        #      repo = "huggingface_hub";
        #      rev = "v0.30.0";
        #      hash = "sha256-sz+n1uoWrSQPqJFiG/qCT6b4r08kD9MsoPZXbfWNB2o=";
        #    };
        #  }
        #);
      }
    )
  ];

  # Non-python package override example:
  #
  # ripgrep = prev.ripgrep.overrideAttrs (
  #    _: _: {
  #      src = final.fetchFromGitHub {
  #      owner = "BurntSushi";
  #      repo = "ripgrep";
  #      rev = "79cbe89deb1151e703f4d91b19af9cdcc128b765";
  #      hash = "sha256-JPTM2KNmGMb+/jOfK3X7OM1wnN+3TU35SJOIcqmp3mg=";
  #   };
  # });
}


================================================
FILE: nix/server.nix
================================================
{
  nix-filter,
  buildPythonPackage,
  poetry-core,
  mypy-protobuf,
  awq-inference-engine,
  causal-conv1d,
  compressed-tensors,
  einops,
  exllamav2,
  flashinfer,
  flash-attn,
  flash-attn-layer-norm,
  flash-attn-v1,
  grpc-interceptor,
  grpcio-reflection,
  grpcio-status,
  grpcio-tools,
  hf-transfer,
  hf-xet,
  kernels,
  loguru,
  mamba-ssm,
  moe,
  opentelemetry-api,
  opentelemetry-exporter-otlp,
  opentelemetry-instrumentation-grpc,
  opentelemetry-semantic-conventions,
  outlines,
  paged-attention,
  peft,
  pillow,
  prometheus-client,
  punica-sgmv,
  py-cpuinfo,
  pydantic,
  quantization,
  quantization-eetq,
  rotary,
  safetensors,
  tokenizers,
  torch,
  sentencepiece,
  transformers,
  typer,
}:

let
  filter = nix-filter.lib;
in
buildPythonPackage {
  name = "text-generation-server";

  src = filter {
    root = ../.;
    include = with filter; [
      isDirectory
      (and (inDirectory "server") (or_ (matchExt "py") (matchExt "pyi")))
      "server/pyproject.toml"
      (and (inDirectory "proto/v3") (matchExt "proto"))
    ];
  };

  pyproject = true;

  build-system = [ poetry-core ];

  nativeBuildInputs = [ mypy-protobuf ];

  pythonRelaxDeps = [
    "einops"
    "huggingface-hub"
    "loguru"
    "opentelemetry-instrumentation-grpc"
    "pillow"
    "sentencepiece"
    "typer"
  ];

  pythonRemoveDeps = [ "scipy" ];

  dependencies = [
    awq-inference-engine
    causal-conv1d
    compressed-tensors
    einops
    exllamav2
    flashinfer
    flash-attn
    flash-attn-layer-norm
    grpc-interceptor
    grpcio-reflection
    grpcio-status
    grpcio-tools
    hf-transfer
    hf-xet
    kernels
    loguru
    mamba-ssm
    moe
    opentelemetry-api
    opentelemetry-exporter-otlp
    opentelemetry-instrumentation-grpc
    opentelemetry-semantic-conventions
    outlines
    paged-attention
    peft
    pillow
    prometheus-client
    punica-sgmv
    py-cpuinfo
    pydantic
    quantization
    quantization-eetq
    rotary
    safetensors
    sentencepiece
    tokenizers
    transformers
    typer
  ];

  prePatch = ''
    python -m grpc_tools.protoc -Iproto/v3 --python_out=server/text_generation_server/pb \
           --grpc_python_out=server/text_generation_server/pb --mypy_out=server/text_generation_server/pb proto/v3/generate.proto
    find server/text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
    touch server/text_generation_server/pb/__init__.py
    cd server
  '';
}


================================================
FILE: proto/generate.proto
================================================
syntax = "proto3";

package generate.v2;

service TextGenerationService {
    /// Model Info
    rpc Info (InfoRequest) returns (InfoResponse) {}
    /// Service discovery
    rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse) {}
    /// Empties batch cache
    rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
    /// Remove requests from a cached batch
    rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
    /// Warmup the model and compute max cache size
    rpc Warmup (WarmupRequest) returns (WarmupResponse);
    /// Prefill batch and decode first token
    rpc Prefill (PrefillRequest) returns (PrefillResponse);
    /// Decode token for a list of prefilled batches
    rpc Decode (DecodeRequest) returns (DecodeResponse);
    /// Health check
    rpc Health (HealthRequest) returns (HealthResponse);
}

message HealthRequest {}
message HealthResponse {}

/// Empty request
message InfoRequest {}

message InfoResponse {
    bool requires_padding = 1;
    string dtype = 2;
    string device_type = 3;
    optional uint32 window_size = 4;
    uint32 speculate = 5;
}

/// Empty request
message ServiceDiscoveryRequest {}

message ServiceDiscoveryResponse {
    /// Other shards urls
    repeated string urls = 1;
}

message ClearCacheRequest {
    /// Optional batch id
    optional uint64 id = 1;
}

/// Empty response
message ClearCacheResponse {}

enum GrammarType {
    GRAMMAR_TYPE_NONE = 0;
    GRAMMAR_TYPE_JSON = 1;
    GRAMMAR_TYPE_REGEX = 2;
}

message NextTokenChooserParameters {
    /// exponential scaling output probability distribution
    float temperature = 1;
    /// restricting to the k highest probability elements
    uint32 top_k = 2;
    /// restricting to top tokens summing to prob_cut_off <= prob_cut_off
    float top_p = 3;
    /// restricting to top tokens summing to prob_cut_off <= prob_cut_off
    float typical_p = 4;
    /// apply sampling on the logits
    bool do_sample = 5;
    /// random seed for sampling
    uint64 seed = 6;
    /// repetition penalty
    float repetition_penalty = 7;
    /// frequency penalty
    float frequency_penalty = 9;
    /// token watermarking using "A Watermark for Large Language Models"
    bool watermark = 8;
    /// grammar (applied if not empty)
    string grammar = 10;
    /// grammar type
    GrammarType grammar_type = 11;
}

message StoppingCriteriaParameters {
    /// Maximum number of generated tokens
    uint32 max_new_tokens = 1;
    /// Optional stopping sequences
    repeated string stop_sequences = 2;
    /// Ignore end of sequence token
    /// used for benchmarking
    bool ignore_eos_token = 3;
}

message Request {
    /// Request ID
    uint64 id = 1;
    /// The generation context
    string inputs = 2;
    /// Context truncation
    uint32 truncate = 3;
    /// Next Token Chooser Parameters
    NextTokenChooserParameters parameters = 4;
    /// Stopping Criteria Parameters
    StoppingCriteriaParameters stopping_parameters = 5;
    /// Return prefill logprobs
    bool prefill_logprobs = 6;
    /// Return most likely n tokens
    uint32 top_n_tokens = 7;
}

message Batch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests
    repeated Request requests = 2;
    /// Batch size (==len(requests))
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
}

message CachedBatch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests ids
    repeated uint64 request_ids = 2;
    /// Batch size (==len(requests))
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
}

enum FinishReason {
    FINISH_REASON_LENGTH = 0;
    FINISH_REASON_EOS_TOKEN = 1;
    FINISH_REASON_STOP_SEQUENCE = 2;
}

message GeneratedText {
    /// Output
    string text = 1;
    /// Number of generated tokens
    uint32 generated_tokens = 2;
    /// Finish reason
    FinishReason finish_reason = 3;
    /// Seed
    optional uint64 seed = 4;
}

message Tokens {
    /// Token IDs
    repeated uint32 ids = 1;
    /// Logprobs
    repeated float logprobs = 2;
    /// tokens
    repeated string texts = 3;
    /// special
    repeated bool is_special = 4;
}

message Generation {
    /// Request ID
    uint64 request_id = 1;
    /// Prefill tokens (optional)
    Tokens prefill_tokens = 2;
    Tokens tokens = 3;
    /// Complete generated text
    optional GeneratedText generated_text = 4;
    /// Top tokens
    repeated Tokens top_tokens = 5;
}

message FilterBatchRequest {
    /// Batch ID
    uint64 batch_id = 1;
    /// Requests to keep
    repeated uint64 request_ids = 2;
}

message FilterBatchResponse {
    /// Filtered Batch (cached)
    CachedBatch batch = 1;
}


message PrefillRequest {
    /// Batch
    Batch batch = 1;
}

message PrefillResponse {
    /// Generation
    repeated Generation generations = 1;
    /// Next batch (cached)
    optional CachedBatch batch = 2;
    /// Forward elapsed time in nanoseconds
    uint64 forward_ns = 3;
    /// Decode elapsed time in nanoseconds
    uint64 decode_ns = 4;
    /// Total elapsed time in nanoseconds
    uint64 total_ns = 5;
}

message DecodeRequest {
    /// Cached batches
    repeated CachedBatch batches = 1;
}

message DecodeResponse {
    /// Decodes
    repeated Generation generations = 1;
    /// Next batch (cached)
    optional CachedBatch batch = 2;
    /// Forward elapsed time in nanoseconds
    uint64 forward_ns = 3;
    /// Decode elapsed time in nanoseconds
    uint64 decode_ns = 4;
    /// Total elapsed time in nanoseconds
    uint64 total_ns = 5;
    /// Concatenate elapsed time in nanoseconds
    optional uint64 concat_ns = 6;
}

message WarmupRequest {
    /// Batch to warmup on
    Batch batch = 1;
    uint32 max_input_length = 2;
    uint32 max_prefill_tokens = 3;
    uint32 max_total_tokens = 4;
}

message WarmupResponse {
    /// Maximum number of tokens supported by the model
    optional uint32 max_supported_total_tokens = 1;
}


================================================
FILE: proto/v3/generate.proto
================================================
syntax = "proto3";

package generate.v3;

service TextGenerationService {
  /// Model Info
  rpc Info(InfoRequest) returns (InfoResponse) {}
  /// Service discovery
  rpc ServiceDiscovery(ServiceDiscoveryRequest)
      returns (ServiceDiscoveryResponse) {}
  /// Empties batch cache
  rpc ClearCache(ClearCacheRequest) returns (ClearCacheResponse);
  /// Remove requests from a cached batch
  rpc FilterBatch(FilterBatchRequest) returns (FilterBatchResponse);
  /// Warmup the model and compute max cache size
  rpc Warmup(WarmupRequest) returns (WarmupResponse);
  /// Prefill batch and decode first token
  rpc Prefill(PrefillRequest) returns (PrefillResponse);
  /// Decode token for a list of prefilled batches
  rpc Decode(DecodeRequest) returns (DecodeResponse);
  /// Health check
  rpc Health(HealthRequest) returns (HealthResponse);
}

message HealthRequest {}
message HealthResponse {}

/// Empty request
message InfoRequest {}

message InfoResponse {
  bool requires_padding = 1;
  string dtype = 2;
  string device_type = 3;
  optional uint32 window_size = 4;
  uint32 speculate = 5;
  bool support_chunking = 6;
  bool use_prefix_caching = 7;
  string attention_impl = 8;
  uint32 block_size = 9;
}

/// Empty request
message ServiceDiscoveryRequest {}

message ServiceDiscoveryResponse {
  /// Other shards urls
  repeated string urls = 1;
}

message ClearCacheRequest {
  /// Optional batch id
  optional uint64 id = 1;
}

/// Empty response
message ClearCacheResponse {}

message Image {
  /// Binary image data.
  bytes data = 1;

  /// Image MIME type.
  string mimetype = 2;
}

message InputChunk {
  oneof chunk {
    /// Plain text data
    string text = 1;
    /// Image data
    Image image = 2;
  }
}

message Input { repeated InputChunk chunks = 1; }

enum GrammarType {
  GRAMMAR_TYPE_NONE = 0;
  GRAMMAR_TYPE_JSON = 1;
  GRAMMAR_TYPE_REGEX = 2;
}

message NextTokenChooserParameters {
  /// exponential scaling output probability distribution
  float temperature = 1;
  /// restricting to the k highest probability elements
  uint32 top_k = 2;
  /// restricting to top tokens summing to prob_cut_off <= prob_cut_off
  float top_p = 3;
  /// restricting to top tokens summing to prob_cut_off <= prob_cut_off
  float typical_p = 4;
  /// apply sampling on the logits
  bool do_sample = 5;
  /// random seed for sampling
  uint64 seed = 6;
  /// repetition penalty
  float repetition_penalty = 7;
  /// frequency penalty
  float frequency_penalty = 9;
  /// token watermarking using "A Watermark for Large Language Models"
  bool watermark = 8;
  /// grammar (applied if not empty)
  string grammar = 10;
  /// grammar type
  GrammarType grammar_type = 11;
}

message StoppingCriteriaParameters {
  /// Maximum number of generated tokens
  uint32 max_new_tokens = 1;
  /// Optional stopping sequences
  repeated string stop_sequences = 2;
  /// Ignore end of sequence token
  /// used for benchmarking
  bool ignore_eos_token = 3;
}

message Request {
  /// Request ID
  uint64 id = 1;
  /// The generation context as chunks
  Input input_chunks = 8;
  /// The generation context, stringified input_chunks
  string inputs = 2;
  /// Context truncation
  uint32 truncate = 3;
  /// Next Token Chooser Parameters
  NextTokenChooserParameters parameters = 4;
  /// Stopping Criteria Parameters
  StoppingCriteriaParameters stopping_parameters = 5;
  /// Return prefill logprobs
  bool prefill_logprobs = 6;
  /// Return most likely n tokens
  uint32 top_n_tokens = 7;
  /// Paged attention blocks
  repeated uint32 blocks = 9;
  /// Paged attention slots
  repeated uint32 slots = 10;
  /// LORA adapter index
  optional string adapter_id = 11;
  /// Tokens that can be retrieved from the KV cache.
  /// This value is set for the first prefill and never reset
  uint32 cache_len = 12;
  /// Context truncation
  bool add_special_tokens = 13;
  /// Chunk of tokens that must be computed for the first prefill
  /// This value is set for the first prefill and never reset
  optional uint32 chunk_len = 14;
}

message Batch {
  /// Batch ID
  uint64 id = 1;
  /// Individual requests
  repeated Request requests = 2;
  /// Batch size (==len(requests))
  uint32 size = 3;
  /// Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
  /// Maximum number of Paged Attention blocks
  uint32 max_blocks = 5;
}

message CachedBatch {
  /// Batch ID
  uint64 id = 1;
  /// Individual requests ids
  repeated uint64 request_ids = 2;
  /// Batch size (==len(requests))
  uint32 size = 3;
  /// Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
  /// Number of tokens in the next forward
  uint32 current_tokens = 5;
}

enum FinishReason {
  FINISH_REASON_LENGTH = 0;
  FINISH_REASON_EOS_TOKEN = 1;
  FINISH_REASON_STOP_SEQUENCE = 2;
}

message GeneratedText {
  /// Output
  string text = 1;
  /// Number of generated tokens
  uint32 generated_tokens = 2;
  /// Finish reason
  FinishReason finish_reason = 3;
  /// Seed
  optional uint64 seed = 4;
}

message Tokens {
  /// Token IDs
  repeated uint32 ids = 1;
  /// Logprobs
  repeated float logprobs = 2;
  /// tokens
  repeated string texts = 3;
  /// special
  repeated bool is_special = 4;
}

message Generation {
  /// Request ID
  uint64 request_id = 1;
  /// Prefill tokens (optional)
  Tokens prefill_tokens = 2;
  Tokens tokens = 3;
  /// Complete generated text
  optional GeneratedText generated_text = 4;
  /// Top tokens
  repeated Tokens top_tokens = 5;
}

message FilterBatchRequest {
  /// Batch ID
  uint64 batch_id = 1;
  /// Requests to keep
  repeated uint64 request_ids = 2;
}

message FilterBatchResponse {
  /// Filtered Batch (cached)
  CachedBatch batch = 1;
}

message PrefillRequest {
  /// Batch
  Batch batch = 1;
  /// Optional cached batch
  CachedBatch cached_batch = 2;
}

message PrefillResponse {
  /// Generation
  repeated Generation generations = 1;
  /// Next batch (cached)
  optional CachedBatch batch = 2;
  /// Forward elapsed time in nanoseconds
  uint64 forward_ns = 3;
  /// Decode elapsed time in nanoseconds
  uint64 decode_ns = 4;
  /// Total elapsed time in nanoseconds
  uint64 total_ns = 5;
  /// Concatenate elapsed time in nanoseconds
  optional uint64 concat_ns = 6;
}

message DecodeRequest {
  /// Cached batches
  repeated CachedBatch batches = 1;
}

message DecodeResponse {
  /// Decodes
  repeated Generation generations = 1;
  /// Next batch (cached)
  optional CachedBatch batch = 2;
  /// Forward elapsed time in nanoseconds
  uint64 forward_ns = 3;
  /// Decode elapsed time in nanoseconds
  uint64 decode_ns = 4;
  /// Total elapsed time in nanoseconds
  uint64 total_ns = 5;
  /// Concatenate elapsed time in nanoseconds
  optional uint64 concat_ns = 6;
}

message WarmupRequest {
  /// Batch to warmup on
  Batch batch = 1;
  optional uint32 max_input_tokens = 2;
  uint32 max_prefill_tokens = 3;
  optional uint32 max_total_tokens = 4;
}

message WarmupResponse {
  /// Maximum number of tokens supported by the model
  optional uint32 max_supported_total_tokens = 1;
  /// Maximum input tokens by clients should be equal to request value if it's set
  /// Otherwise warmup automatically allocates a value here
  uint32 max_input_tokens = 2;
  /// Maximum total tokens by clients should be equal to request value if it's set
  /// Otherwise warmup automatically allocates a value here
  uint32 max_total_tokens = 3;
}


================================================
FILE: router/Cargo.toml
================================================
[package]
name = "text-generation-router"
description = "Text Generation Webserver"
build = "build.rs"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[dependencies]
anyhow = "1"
async-trait = "0.1.74"
async-stream = "0.3.5"
axum = { version = "0.7", features = ["json"] }
axum-tracing-opentelemetry = "0.16"
clap = { version = "4.4.5", features = ["derive", "env"] }
futures = "0.3.28"
hf-hub = { workspace = true }
itertools = "0.10"
jsonschema = { version = "0.28.0" }
metrics = { workspace = true }
metrics-exporter-prometheus = { workspace = true }
nohash-hasher = "0.2.0"
opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
opentelemetry-otlp = "0.13.0"
outlines-core = { git = "https://github.com/dottxt-ai/outlines-core.git", rev = "ba10c619fc9bf3c487e43f49bdecb95a24bb465c" }
rand = "0.8.5"
reqwest = { version = "0.11.20", features = ["blocking"] }
serde = "1.0.188"
serde_json = "1.0.107"
thiserror = "1.0.48"
tokenizers = { workspace = true }
tokio = { version = "1.32.0", features = [
  "rt",
  "rt-multi-thread",
  "parking_lot",
  "signal",
  "sync",
] }
tokio-stream = "0.1.14"
tower-http = { version = "0.5.1", features = ["cors"] }
tracing = "0.1.40"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3.18", features = ["json", "env-filter"] }
utoipa = { version = "4.2.0", features = ["axum_extras"] }
utoipa-swagger-ui = { version = "6.0.0", features = ["axum"] }
ngrok = { version = "0.13.1", features = ["axum"], optional = true }
init-tracing-opentelemetry = { version = "0.14.1", features = [
  "opentelemetry-otlp",
] }
minijinja = { workspace = true, features = ["loop_controls"] }
minijinja-contrib = { workspace = true }
futures-util = "0.3.30"
regex = "1.10.3"
once_cell = "1.19.0"
image = "0.25.1"
base64 = { workspace = true }
sysinfo = "0.30.13"
uuid = { version = "1.9.1", default-features = false, features = [
  "v4",
  "fast-rng",
  "macro-diagnostics",
] }
csv = "1.3.0"
ureq = "=2.9"
pyo3 = { workspace = true }
chrono = "0.4.39"


[build-dependencies]
vergen = { version = "8.2.5", features = ["build", "git", "gitcl"] }

[features]
default = ["ngrok"]
ngrok = ["dep:ngrok"]
google = []
kserve = []


================================================
FILE: router/README.md
================================================
# Router

Also named `webserver` throughout the docs.

This router is handling most of the logic to handle the "batches" tell
when to pass new `prefill` requests and pausing `decode` requests, which ones etc...

It uses gRPC to communicate with the shards which can therefore be kept
much simpler and focus on having the most efficient forward passes as possible.

## Continuous batching

One important feature of `text-generation-inference` is enabled
by this `router`.

Continuous batching is the act of regularly running queries in the same
`forward` step of the LLM (a "batch") and also removing them when they are
finished.

In order for continuous batching to be useful, you need to have more compute available
with respect to the memory requirements of your model. This is essentially true for
LLMs and the larger the model, the truer it gets (since you have to pool multiple
GPUs to load the model, you effectively have a lot of compute power at your hands).


Static batching is the act of doing several queries at the same time, but usually
this is controlled by the client, and therefore the amount of batching is decided
beforehand.

For text-generation, and LLMs which are memory bound we can try to be much more
efficient with the available compute, by having client sending us single queries,
and let the router mix&match queries into or out of batches to make the use the
compute the most efficiently. This is possible because for LLMs the total compute
for running the model is much bigger than doing mix&match of the batches themselves.


### Simple continuous batching

text-generation works by feeding a prompt to a model, and iteratively calling
`forward` on the model to produce new text, 1 token at a time.

The first idea is simple, when a query arrives, we start working on it directly.
When new queries arrive, we simply wait for the current `forward` to be finished
then batch the current running prompt with the new query, and call `forward`.

Whenever either query is finished: either the model produce EOS (end of sentence) token
or the query reached the allowed limit. We simply drop it from the batch, remove
all the allocated memory and we can continue with the rest until nothing is left.

This simple idea generalizes very well and we could potentially stack many requests
in the same batch.

One thing to note, is that queries can be potentially run with different parameters
meaning different way to choose the next token (sampling, not sampling, temperature, top_k etc..). This is not problematic for the proposed approach we just need to do the sampling
independantly on each member of the batch.

### Prefill, decode and past key values

In order to make LLMs and text-generation efficient, there's actually a very powerful
trick that can be used, which is the "caching" of some attention matrices. [More on that
in the first part of this blog](https://huggingface.co/blog/accelerated-inference#getting-to-the-first-10x-speedup)

What this means, is that the first "pass" of a prompt is different from the subsequent
"forward" passes. Since for the first one we have to compute the entire attention matrix, whereas in the follow-ups only require to compute the new token attention.
The first pass is called `prefill` throughout this codebase where as the follow-ups are called `decode`.

Since `prefill` is much more expensive than `decode` we don't want to do it all the time,
but a currently running query is probably doing `decode`. If we want to do the continuous
batching as explained previously we need to run `prefill` at some point in order to create
the attention matrix required to be able to join the `decode` group.

`text-generation-inference` uses a bunch of different strategies and parameters in
order to enable you to find the sweet spot between exploiting the hardware and perceived latency.

With no continuous batching at all, latency is going to be super good, but throughput (meaning
the total number of requests allowed in a given timeframe) is going to be super bad (since it's essentially 1).

With static batching, you can probably reach the maximum throughput (by using the maximum total batch size applicable to your hardware), but the latency is super bad since in order to have maximum throughput you need to wait for requests to come in before processing.

With continuous batching you can find a sweet spot. In general latency is the most critical
parameter users care about. But a 2x latency slowdown for 10x more users on the same
hardware is an acceptable tradeoff.

## Token streaming

This is a very important aspect of client UX. As mentionned above, latency is the
most critical perceived quality of an LLM API.

With token streaming, the server can start answering after the first `prefill` pass
directly, without waiting for all the generation to be done. For extremely long queries
this means clients can start to see something happening orders of magnitude before
the work is done. Seeing something in progress allows them to cut short if it's not
what's wanted but also it "feels" better.


================================================
FILE: router/build.rs
================================================
use std::error::Error;
use vergen::EmitBuilder;

fn main() -> Result<(), Box<dyn Error>> {
    // Try to get the git sha from the local git repository
    if EmitBuilder::builder()
        .fail_on_error()
        .git_sha(false)
        .emit()
        .is_err()
    {
        // Unable to get the git sha
        if let Ok(sha) = std::env::var("GIT_SHA") {
            // Set it from an env var
            println!("cargo:rustc-env=VERGEN_GIT_SHA={sha}");
        }
    }

    // Set docker label if present
    if let Ok(label) = std::env::var("DOCKER_LABEL") {
        // Set it from an env var
        println!("cargo:rustc-env=DOCKER_LABEL={label}");
    }

    Ok(())
}


================================================
FILE: router/src/chat.rs
================================================
use crate::{
    infer::InferError, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionDelta,
    ChatCompletionLogprobs, CompletionType, DeltaToolCall, Function, FunctionDefinition,
    StreamOptions, StreamResponse, TextMessage, ToolCallDelta, Usage,
};
use serde::Deserialize;
use serde_json::Value;

#[derive(Debug, Deserialize)]
struct ToolCall {
    _name: String,
    #[serde(flatten, default)]
    /// Using Map to preserve order
    arguments: serde_json::Map<String, Value>,
}
#[derive(Debug, Deserialize)]
struct Call {
    function: ToolCall,
}

#[cfg_attr(test, derive(Debug))]
pub(crate) enum ChatEvent {
    NoTool,
    Events(Vec<CompletionType>),
}

#[cfg_attr(test, derive(Debug))]
pub(crate) enum ChatChoice {
    NoTool,
    ToolCalls(Vec<crate::ToolCall>),
}

pub(crate) fn parse_output(generated_text: &str) -> Result<ChatChoice, InferError> {
    let call: Call = serde_json::from_str(generated_text).map_err(|e| {
        InferError::ToolError(format!(
            "Failed to parse generated text: {} {:?}",
            e, generated_text
        ))
    })?;
    let name = call.function._name;

    match &name[..] {
        "no_tool" => {
            // parse the content message
            Ok(ChatChoice::NoTool)
        }
        name => {
            let tool_calls = vec![crate::ToolCall {
                id: "0".to_string(),
                r#type: "function".to_string(),
                function: FunctionDefinition {
                    description: None,
                    name: name.to_string(),
                    arguments: serde_json::to_value(call.function.arguments).map_err(|err| {
                        InferError::ToolError(format!(
                            "Could not convert arguments to JSON map {err}"
                        ))
                    })?,
                },
            }];
            Ok(ChatChoice::ToolCalls(tool_calls))
        }
    }
}

/// Convert a StreamResponse into an Event to be sent over SSE
fn create_event_from_stream_token(
    stream_token: &StreamResponse,
    logprobs: bool,
    inner_using_tools: bool,
    system_fingerprint: String,
    model_id: String,
    function_name: Option<String>,
    id: String,
) -> CompletionType {
    let current_time = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_else(|_| std::time::Duration::from_secs(0))
        .as_secs();

    let logprobs = logprobs.then(|| {
        ChatCompletionLogprobs::from((stream_token.token.clone(), stream_token.top_tokens.clone()))
    });

    // replace the content with the tool calls if grammar is present
    let content = if !stream_token.token.special {
        Some(stream_token.token.text.clone())
    } else {
        None
    };
    let (content, tool_calls) = if inner_using_tools {
        // Cast into a vec
        (None, content)
    } else {
        (content, None)
    };
    let finish_reason = stream_token
        .details
        .as_ref()
        .map(|details| details.finish_reason.format(true));
    let delta = match (content, tool_calls) {
        (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage {
            role: "assistant".to_string(),
            content: delta,
            ..Default::default()
        }),
        (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta {
            role: "assistant".to_string(),
            tool_calls: vec![DeltaToolCall {
                index: 0,
                id,
                r#type: "function".to_string(),
                function: Function {
                    name: function_name,
                    arguments: tool_calls,
                },
            }],
        }),
        (None, None) => ChatCompletionDelta::Chat(TextMessage {
            role: "assistant".to_string(),
            content: "".to_string(),
            ..Default::default()
        }),
    };
    let choices = vec![ChatCompletionChoice {
        index: 0,
        delta,
        logprobs,
        finish_reason,
    }];
    CompletionType::ChatCompletionChunk(ChatCompletionChunk::new(
        model_id,
        system_fingerprint,
        current_time,
        choices,
        None,
    ))
}

#[derive(Debug)]
enum StreamState {
    /// Before the tools was parsed
    Buffering,
    /// We detected a tool call here
    Tool,
    /// This is without tool calling
    Content,
}

pub struct ChatState {
    state: StreamState,
    text: String,
    options: StreamOptions,
    model_id: String,
    fingerprint: String,
    logprobs: bool,
    id: String,
}

impl ChatState {
    pub fn new(
        using_tools: bool,
        options: StreamOptions,
        fingerprint: String,
        model_id: String,
        logprobs: bool,
        id: String,
    ) -> Self {
        let state = if using_tools {
            StreamState::Buffering
        } else {
            StreamState::Content
        };
        let text = String::new();
        Self {
            state,
            text,
            options,
            fingerprint,
            model_id,
            logprobs,
            id,
        }
    }

    pub fn push(&mut self, mut stream_token: StreamResponse) -> ChatEvent {
        let mut events = vec![];
        let token_text = &stream_token.token.text;
        match self.state {
            StreamState::Buffering => {
                self.text.push_str(token_text);
                tracing::info!("Current text {:?}", self.text);
                let partial = &self.text;
                let partial =
                    partial.trim_end_matches(|c: char| c.is_whitespace() || c == ',' || c == '}');
                if let Ok(call) = serde_json::from_str::<Call>(&format!("{}}}}}", partial)) {
                    // This can be no_tool before the content has been emitted
                    if call.function._name != "no_tool" {
                        stream_token.token.text = "{".to_string();
                        let chat_complete = create_event_from_stream_token(
                            &stream_token,
                            self.logprobs,
                            true,
                            self.fingerprint.clone(),
                            self.model_id.clone(),
                            Some(call.function._name),
                            self.id.clone(),
                        );

                        events.push(chat_complete);
                        self.state = StreamState::Tool;
                    } else {
                        return ChatEvent::NoTool;
                    }
                }
            }
            StreamState::Tool => {
                self.text.push_str(token_text);
                if serde_json::from_str::<Call>(&self.text).is_ok() {
                    self.state = StreamState::Buffering;
                    let mut text = stream_token.token.text.trim_end();
                    // Effectively trimming only the last closing brace
                    if text.ends_with('}') {
                        text = &text[..text.len() - 1];
                    }
                    stream_token.token.text = text.to_string();
                    let chat_complete = create_event_from_stream_token(
                        &stream_token,
                        self.logprobs,
                        true,
                        self.fingerprint.clone(),
                        self.model_id.clone(),
                        None,
                        self.id.clone(),
                    );
                    events.push(chat_complete);
                } else {
                    let chat_complete = create_event_from_stream_token(
                        &stream_token,
                        self.logprobs,
                        true,
                        self.fingerprint.clone(),
                        self.model_id.clone(),
                        None,
                        self.id.clone(),
                    );
                    events.push(chat_complete);
                }
            }
            StreamState::Content => {
                let chat_complete = create_event_from_stream_token(
                    &stream_token,
                    self.logprobs,
                    false,
                    self.fingerprint.clone(),
                    self.model_id.clone(),
                    None,
                    self.id.clone(),
                );

                events.push(chat_complete);
            }
        }

        if self.options.include_usage {
            if let Some(details) = stream_token.details {
                let completion_tokens = details.generated_tokens;
                let prompt_tokens = details.input_length;
                let total_tokens = prompt_tokens + completion_tokens;

                let usage = Usage {
                    completion_tokens,
                    prompt_tokens,
                    total_tokens,
                };
                let current_time = std::time::SystemTime::now()
                    .duration_since(std::time::UNIX_EPOCH)
                    .unwrap_or_else(|_| std::time::Duration::from_secs(0))
                    .as_secs();

                let chat_complete = CompletionType::ChatCompletionChunk(ChatCompletionChunk {
                    id: String::new(),
                    created: current_time,
                    model: self.model_id.clone(),
                    system_fingerprint: self.fingerprint.clone(),
                    choices: vec![],
                    usage: Some(Usage {
                        prompt_tokens: usage.prompt_tokens,
                        completion_tokens: usage.completion_tokens,
                        total_tokens: usage.total_tokens,
                    }),
                });

                events.push(chat_complete);
            }
        }
        ChatEvent::Events(events)
    }
}

#[cfg(test)]
mod tests {
    use crate::{
        ChatCompletionChoice, ChatCompletionDelta, FinishReason, StreamDetails, TextMessage, Token,
    };

    use super::*;

    fn get_tool_call_content(event: &CompletionType) -> (Option<&String>, &String) {
        match event {
            CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
                assert_eq!(choices.len(), 1);
                if let ChatCompletionChoice {
                    delta: ChatCompletionDelta::Tool(ToolCallDelta { tool_calls, .. }),
                    ..
                } = &choices[0]
                {
                    assert_eq!(tool_calls.len(), 1);
                    let DeltaToolCall {
                        index,
                        id,
                        r#type,
                        function,
                    } = &tool_calls[0];
                    assert_eq!(*index, 0);
                    assert_eq!(id, "0");
                    assert_eq!(r#type, "function");
                    (function.name.as_ref(), &function.arguments)
                } else {
                    panic!("Expected plain message");
                }
            }
            _ => panic!("Unexpected chunk"),
        }
    }

    #[test]
    fn test_chat_stream() {
        let mut chat_state = ChatState::new(
            false,
            StreamOptions {
                include_usage: false,
            },
            "fingerprint".to_string(),
            "model_id".to_string(),
            false,
            "0".to_string(),
        );

        let events = chat_state.push(StreamResponse {
            generated_text: None,
            token: Token {
                id: 42,
                text: "Hi".to_string(),
                logprob: 0.0,
                special: false,
            },
            top_tokens: vec![],
            index: 0,
            details: None,
        });
        if let ChatEvent::Events(events) = events {
            assert_eq!(events.len(), 1);
            match &events[0] {
                CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
                    assert_eq!(
                        choices,
                        &[ChatCompletionChoice {
                            index: 0,
                            delta: ChatCompletionDelta::Chat(TextMessage {
                                role: "assistant".to_string(),
                                content: "Hi".to_string(),
                                tool_call_id: None,
                            }),
                            logprobs: None,
                            finish_reason: None,
                        }]
                    );
                }
                _ => panic!("Unexpected chunk"),
            }
        } else {
            panic!("Expected chat events");
        }
    }

    #[test]
    fn test_chat_stream_usage() {
        let mut chat_state = ChatState::new(
            false,
            StreamOptions {
                include_usage: true,
            },
            "fingerprint".to_string(),
            "model_id".to_string(),
            false,
            "0".to_string(),
        );

        let events = chat_state.push(StreamResponse {
            generated_text: None,
            token: Token {
                id: 42,
                text: "Hi".to_string(),
                logprob: 0.0,
                special: false,
            },
            top_tokens: vec![],
            index: 0,
            details: Some(StreamDetails {
                input_length: 2,
                generated_tokens: 10,
                seed: None,
                finish_reason: FinishReason::Length,
            }),
        });
        if let ChatEvent::Events(events) = events {
            assert_eq!(events.len(), 2);
            match &events[0] {
                CompletionType::ChatCompletionChunk(ChatCompletionChunk { choices, .. }) => {
                    assert_eq!(
                        choices,
                        &[ChatCompletionChoice {
                            index: 0,
                            delta: ChatCompletionDelta::Chat(TextMessage {
                                role: "assistant".to_string(),
                                content: "Hi".to_string(),
                                tool_call_id: None,
                            }),
                            logprobs: None,
                            // HAS A FINISH REASON
                            finish_reason: Some("length".to_string()),
                        }]
                    );
                }
                _ => panic!("Unexpected chunk"),
            }
            match &events[1] {
                CompletionType::ChatCompletionChunk(ChatCompletionChunk { usage, .. }) => {
                    assert_eq!(
                        *usage,
                        Some(Usage {
                            prompt_tokens: 2,
                            completion_tokens: 10,
                            total_tokens: 12,
                        })
                    );
                }
                _ => panic!("Unexpected chunk"),
            }
        } else {
            panic!("Expected chat events");
        }
    }

    #[test]
    fn test_chat_stream_tool_no_tool_simple() {
        let mut chat_state = ChatState::new(
            true,
            StreamOptions {
                include_usage: true,
            },
            "fingerprint".to_string(),
            "model_id".to_string(),
            false,
            "0".to_string(),
        );

        let tokens = vec![
            "{\"".to_string(),
            "function".to_string(),
            "\":".to_string(),
            " {\"".to_string(),
            "_".to_string(),
            "name".to_string(),
            "\":".to_string(),
            " \"".to_string(),
            "no".to_string(),
            "_tool".to_string(),
            "\",".to_string(),
            " \"".to_string(),
            "content".to_string(),
            "\":".to_string(),
            " \"".to_string(),        // Token 14
            "I".to_string(),          // Event 1
            " am".to_string(),        // Event 2
            " a".to_string(),         // Event 3
            " helpful".to_string(),   // Event 4
            " assistant".to_string(), // Event 5
            "!\"".to_string(),        // Event 6 (with trailing quore removed)
            "}".to_string(),
            "}".to_string(),
        ];
        let tokens: Vec<_> = tokens
            .into_iter()
            .map(|text| StreamResponse {
                generated_text: None,
                token: Token {
                    id: 42,
                    text: text.to_string(),
                    logprob: 0.0,
                    special: false,
                },
                top_tokens: vec![],
                index: 0,
                details: None,
            })
            .collect();

        // Initial ignored output
        for token in &tokens[..10] {
            let events = chat_state.push(token.clone());
            if let ChatEvent::Events(events) = events {
                assert_eq!(events.len(), 0, "{events:?}");
            } else {
                panic!("Expected chat events");
            }
        }

        // No tool output
        let events = chat_state.push(tokens[10].clone());
        if let ChatEvent::NoTool = events {
            assert!(true);
        } else {
            panic!("Expected chat events");
        }
    }

    #[test]
    fn test_chat_stream_tool_no_tool_empty() {
        let mut chat_state = ChatState::new(
            true,
            StreamOptions {
                include_usage: true,
            },
            "fingerprint".to_string(),
            "model_id".to_string(),
            false,
            "0".to_string(),
        );

        let tokens = vec![
            "{\"".to_string(),
            "function".to_string(),
            "\":".to_string(),
            " {\"".to_string(),
            "_".to_string(),
            "name".to_string(),
            "\":".to_string(),
            " \"".to_string(),
            "no".to_string(),
            "_tool".to_string(),
            "\",".to_string(),
            " \"".to_string(),
            "content".to_string(),
            "\":\"".to_string(),
            "\"}".to_string(), // Token 13
            "}".to_string(),   // Event 1
        ];
        let tokens: Vec<_> = tokens
            .into_iter()
            .map(|text| StreamResponse {
                generated_text: None,
                token: Token {
                    id: 42,
                    text: text.to_string(),
                    logprob: 0.0,
                    special: false,
                },
                top_tokens: vec![],
                index: 0,
                details: None,
            })
            .collect();

        // Initial ignored output
        for token in &tokens[..10] {
            let events = chat_state.push(token.clone());
            if let ChatEvent::Events(events) = events {
                assert_eq!(events.len(), 0, "{events:?}");
            } else {
                panic!("Expected chat events");
            }
        }

        // No tool output
        let events = chat_state.push(tokens[10].clone());
        if let ChatEvent::NoTool = events {
            assert!(true);
        } else {
            panic!("Expected chat events");
        }
    }

    #[test]
    fn test_chat_stream_tool_get_weather() {
        let mut chat_state = ChatState::new(
            true,
            StreamOptions {
                include_usage: true,
            },
            "fingerprint".to_string(),
            "model_id".to_string(),
            false,
            "0".to_string(),
        );

        let tokens = vec![
            "{\"".to_string(),
            "function".to_string(),
            "\":".to_string(),
            " {\"".to_string(),
            "_".to_string(),
            "name".to_string(),
            "\":".to_string(),
            " \"".to_string(),
            "get".to_string(),
            "_current".to_string(),
            "_weather".to_string(),
            "\",".to_string(),
            // Event 1 is the function name
            // Event 2 is the start of the arguments "{"
            " \"".to_string(),        // Event 3
            "location".to_string(),   // Event 4
            "\":".to_string(),        // Event 5
            " \"".to_string(),        // Event 6
            "San".to_string(),        // Event 7
            " Francisco".to_string(), // Event 8
            ",".to_string(),          // Event 9
            " CA".to_string(),        // Event 10
            "\",".to_string(),        // Event 11
            " \"".to_string(),        // Event 12
            "format".to_string(),     // Event 13
            "\":".to_string(),        // Event 14
            " \"".to_string(),        // Event 15
            "c".to_string(),          // Event 16
            "elsius".to_string(),     // Event 17
            "\"}}".to_string(),       // Event 18 retained (trailing brace removed)
        ];
        let tokens: Vec<_> = tokens
            .into_iter()
            .map(|text| StreamResponse {
                generated_text: None,
                token: Token {
                    id: 42,
                    text: text.to_string(),
                    logprob: 0.0,
                    special: false,
                },
                top_tokens: vec![],
                index: 0,
                details: None,
            })
            .collect();

        // Initial ignored output
        for token in &tokens[..11] {
            let events = chat_state.push(token.clone());
            if let ChatEvent::Events(events) = events {
                assert_eq!(events.len(), 0, "{events:?}");
            } else {
                panic!("Expected chat events");
            }
        }

        // No tool output
        let mut output = String::new();
        let mut output_name = String::new();
        for token in &tokens[11..11 + 17] {
            let events = chat_state.push(token.clone());
            if let ChatEvent::Events(events) = events {
                assert_eq!(events.len(), 1);
                let (name, arguments) = get_tool_call_content(&events[0]);
                if let Some(name) = name {
                    assert_eq!(name, "get_current_weather");
                    output_name.push_str(name);
                }
                output.push_str(arguments);
            } else {
                panic!("Expected chat events");
            }
        }

        assert_eq!(output_name, "get_current_weather");
        assert_eq!(
            output,
            "{ \"location\": \"San Francisco, CA\", \"format\": \"celsius\"}"
        );

        // No tool finish
        for token in &tokens[11 + 17..] {
            let events = chat_state.push(token.clone());
            if let ChatEvent::Events(events) = events {
                assert_eq!(events.len(), 0, "{events:?}");
            } else {
                panic!("Expected chat events");
            }
        }
    }
}


================================================
FILE: router/src/config.rs
================================================
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "model_type")]
#[serde(rename_all = "snake_case")]
pub struct LlavaNext {
    pub(crate) text_config: TextConfig,
    pub(crate) vision_config: VisionConfig,
    pub(crate) image_grid_pinpoints: Vec<(usize, usize)>,
}

fn get_anyres_image_grid_shape(
    height: usize,
    width: usize,
    grid_pinpoints: &[(usize, usize)],
    patch_size: usize,
) -> (usize, usize) {
    let (height, width) = select_best_resolution(height, width, grid_pinpoints);
    (height / patch_size, width / patch_size)
}

/// Selects the best resolution from a list of possible resolutions based on the original size.
/// This is done by calculating the effective and wasted resolution for each possible resolution.
/// The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
fn select_best_resolution(
    original_height: usize,
    original_width: usize,
    possible_resolutions: &[(usize, usize)],
) -> (usize, usize) {
    let mut best_fit = None;
    let mut max_effective_resolution = 0;
    let mut min_wasted_resolution = f32::NEG_INFINITY;

    for (height, width) in possible_resolutions {
        let wscale = *width as f32 / original_width as f32;
        let hscale = *height as f32 / original_height as f32;
        // f32 partial ord.
        let scale = if wscale > hscale { hscale } else { wscale };
        let downscaled_width = (*width as f32 * scale) as usize;
        let downscaled_height = (*height as f32 * scale) as usize;
        let effective_resolution = std::cmp::min(
            downscaled_width * downscaled_height,
            original_width * original_height,
        );
        let wasted_resolution = (width * height) - effective_resolution;

        if effective_resolution > max_effective_resolution
            || (effective_resolution == max_effective_resolution
                && (wasted_resolution as f32) < min_wasted_resolution)
        {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution as f32;
            best_fit = Some((*height, *width));
        }
    }

    best_fit.unwrap_or((original_height, original_width))
}

fn get_unpadded_features(
    height: usize,
    width: usize,
    npatches: usize,
    num_patch_height: usize,
    num_patch_width: usize,
) -> (usize, usize) {
    let current_height = npatches * num_patch_height;
    let current_width = npatches * num_patch_width;

    let aspect_ratio: f64 = width as f64 / height as f64;
    let current_aspect_ratio: f64 = current_width as f64 / current_height as f64;
    let (current_height, current_width) = if aspect_ratio > current_aspect_ratio {
        let new_height = (height * current_width) / width;
        let padding = (current_height - new_height) / 2;
        (current_height - (2 * padding), current_width)
    } else {
        let new_width = (width * current_height) / height;
        let padding = (current_width - new_width) / 2;
        (current_height, current_width - (2 * padding))
    };

    let unpadded_features = current_height * current_width;
    let newline_features = current_height;
    (unpadded_features, newline_features)
}

impl LlavaNext {
    pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
        let image_size = self.vision_config.image_size;
        let patch_size = self.vision_config.patch_size;
        assert!(image_size % patch_size == 0);
        let npatches = image_size / patch_size;
        // Dimensions are intentionally swapped to be bug-compatible with
        // upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
        let (num_patch_width, num_patch_height) =
            get_anyres_image_grid_shape(height, width, &self.image_grid_pinpoints, image_size);

        let (unpadded_features, newline_features) =
            get_unpadded_features(height, width, npatches, num_patch_height, num_patch_width);
        // The base patch covers the entire image
        let base_features = npatches.pow(2);
        unpadded_features + newline_features + base_features
    }
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Llama4VisionConfig {
    image_size: usize,
    patch_size: usize,
    pixel_shuffle_ratio: f64,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Llama4 {
    text_config: TextConfig,
    vision_config: Llama4VisionConfig,
}

fn gcd(a: usize, b: usize) -> usize {
    if b == 0 {
        a
    } else {
        gcd(b, a % b)
    }
}

fn get_factors(dividend: usize) -> HashSet<usize> {
    let mut factors_set = HashSet::new();

    for i in 1..=((dividend as f64).sqrt() as usize) {
        if dividend % i == 0 {
            factors_set.insert(i);
            factors_set.insert(dividend / i);
        }
    }

    factors_set
}

fn find_supported_resolutions(max_num_chunks: usize, height: usize) -> Vec<(usize, usize)> {
    let patch_size = height;

    let mut asp_dict: HashMap<(usize, usize), Vec<(usize, usize)>> = HashMap::new();

    for chunk_size in (1..=max_num_chunks).rev() {
        let mut _factors: Vec<_> = get_factors(chunk_size).into_iter().collect();
        _factors.sort();
        let _asp_ratios: Vec<(usize, usize)> =
            _factors.iter().map(|&f| (f, chunk_size / f)).collect();

        for (h, w) in _asp_ratios {
            let divisor = gcd(h, w);
            let key = (h / divisor, w / divisor); // reduced aspect ratio as key

            asp_dict.entry(key).or_default().push((h, w));
        }
    }

    let mut possible_resolutions = vec![];

    for (_key, value) in asp_dict {
        for (h, w) in value {
            possible_resolutions.push((h * patch_size, w * patch_size));
        }
    }

    possible_resolutions
}

fn get_best_fit(
    original_height: usize,
    original_width: usize,
    possible_resolutions: &[(usize, usize)],
    resize_to_max_canvas: bool,
) -> (usize, usize) {
    let orig_h = original_height as f32;
    let orig_w = original_width as f32;

    let mut scales = Vec::with_capacity(possible_resolutions.len());

    for &(h, w) in possible_resolutions.iter() {
        let scale_h = h as f32 / orig_h;
        let scale_w = w as f32 / orig_w;
        let scale = scale_h.min(scale_w);
        scales.push(scale);
    }

    let upscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s >= 1.0).collect();
    let selected_scale = if !upscaling_options.is_empty() {
        if resize_to_max_canvas {
            upscaling_options.into_iter().fold(f32::MIN, f32::max)
        } else {
            upscaling_options.into_iter().fold(f32::MAX, f32::min)
        }
    } else {
        let downscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s < 1.0).collect();
        downscaling_options.into_iter().fold(f32::MIN, f32::max)
    };

    let chosen_canvas: Vec<(usize, usize)> = possible_resolutions
        .iter()
        .zip(scales.iter())
        .filter(|&(_, &s)| (s - selected_scale).abs() < f32::EPSILON)
        .map(|(&(h, w), _)| (h, w))
        .collect();

    if chosen_canvas.len() > 1 {
        chosen_canvas
            .into_iter()
            .min_by_key(|(h, w)| h * w)
            .unwrap()
    } else {
        chosen_canvas[0]
    }
}

impl Llama4 {
    pub fn image_size(&self) -> usize {
        self.vision_config.image_size
    }

    pub fn patch_size(&self) -> usize {
        self.vision_config.patch_size
    }

    pub fn pixel_shuffle_ratio(&self) -> f64 {
        self.vision_config.pixel_shuffle_ratio
    }
    pub fn get_aspect_ratios(
        &self,
        height: usize,
        width: usize,
        max_chunks: usize,
    ) -> (usize, usize) {
        let patch_size = self.vision_config.image_size;
        let supported = find_supported_resolutions(max_chunks, patch_size);
        let (target_h, target_w) = get_best_fit(height, width, &supported, false);
        (target_h / patch_size, target_w / patch_size)
    }
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct ClipVisionModel {
    image_size: usize,
    patch_size: usize,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Idefics3 {}

impl Idefics3 {
    pub fn get_max_longest_edge(&self) -> usize {
        364
    }

    pub fn get_number_of_features(&self) -> usize {
        169
    }

    pub fn get_max_longest_edge_for_image_resize(&self) -> usize {
        1456
    }

    pub fn get_max_image_size(&self) -> usize {
        4096
    }
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Idefics2 {}

impl Idefics2 {
    pub fn get_number_of_features(&self, _height: usize, _width: usize) -> usize {
        64
    }
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct PaliTextConfig {
    pub(crate) num_image_tokens: usize,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Paligemma {
    pub(crate) text_config: PaliTextConfig,
}

impl Paligemma {
    pub fn get_number_of_features(&self, _height: usize, _width: usize) -> usize {
        self.text_config.num_image_tokens
    }
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Qwen2VlVisionConfig {
    pub(crate) depth: usize,
    pub(crate) embed_dim: usize,
    pub(crate) mlp_ratio: usize,
    pub(crate) num_heads: usize,
    pub(crate) in_chans: usize,
    pub(crate) hidden_size: usize,
    pub(crate) patch_size: usize,
    pub(crate) spatial_merge_size: usize,
    pub(crate) spatial_patch_size: usize,
    pub(crate) temporal_patch_size: usize,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Qwen2Vl {
    pub(crate) vision_config: Qwen2VlVisionConfig,
}

impl Qwen2Vl {
    pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
        let num_pixels = height * width;
        num_pixels / self.vision_config.patch_size.pow(2)
    }
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Qwen2_5VlVisionConfig {
    // pub(crate) depth: usize,
    // pub(crate) hidden_act: String,
    // pub(crate) hidden_size: usize,
    // pub(crate) intermediate_size: usize,
    // pub(crate) num_heads: usize,
    // pub(crate) in_chans: usize,
    // pub(crate) out_hidden_size: usize,
    // pub(crate) patch_size: usize,
    // pub(crate) spatial_merge_size: usize,
    pub(crate) spatial_patch_size: usize,
    // pub(crate) window_size: usize,
    // pub(crate) fullatt_block_indexes: Vec<usize>,
    // pub(crate) tokens_per_second: usize,
    // pub(crate) temporal_patch_size: usize,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Qwen2_5Vl {
    pub(crate) vision_config: Qwen2_5VlVisionConfig,
}

impl Qwen2_5Vl {
    pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
        let num_pixels = height * width;
        num_pixels / self.vision_config.spatial_patch_size.pow(2)
    }
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Gemma3VisionConfig {
    pub(crate) image_size: usize,
    pub(crate) patch_size: usize,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Gemma3 {
    vision_config: Gemma3VisionConfig,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "model_type")]
#[serde(rename_all = "snake_case")]
pub enum Config {
    Qwen2_5Vl(Qwen2_5Vl),
    Qwen2Vl(Qwen2Vl),
    LlavaNext(LlavaNext),
    ClipVisionModel(ClipVisionModel),
    Mistral,
    Mamba,
    Idefics,
    Mllama,
    Idefics2(Idefics2),
    Idefics3(Idefics3),
    Ssm,
    GptBigcode,
    Granite,
    Santacoder,
    Bloom,
    Mpt,
    Gpt2,
    Gptj,
    GptNeox,
    Phi,
    #[serde(rename = "phi-msft")]
    PhiMsft,
    Phi3,
    Phimoe,
    Llama,
    Llama4(Llama4),
    Baichuan,
    Paligemma(Paligemma),
    Gemma,
    Gemma2,
    Gemma3(Gemma3),
    Gemma3Text,
    Cohere,
    Drbx,
    Falcon,
    Mixtral,
    Starcoder2,
    Qwen2,
    Opt,
    T5,
    DeepseekV2,
    DeepseekV3,
    Qwen3,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct TextConfig {}

#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct VisionConfig {
    pub(crate) image_size: usize,
    pub(crate) patch_size: usize,
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_llava_next_features() {
        let config = LlavaNext {
            text_config: TextConfig {},
            vision_config: VisionConfig {
                image_size: 336,
                patch_size: 14,
            },
            image_grid_pinpoints: vec![
                (336, 672),
                (672, 336),
                (672, 672),
                (1008, 336),
                (336, 1008),
            ],
        };

        let slots = config.get_number_of_features(20, 20);
        assert_eq!(slots, 1176);
        let slots = config.get_number_of_features(640, 640);
        assert_eq!(slots, 2928);
        let slots = config.get_number_of_features(480, 640);
        assert_eq!(slots, 2340);
        let slots = config.get_number_of_features(899, 1024);
        assert_eq!(slots, 2634);
        let slots = config.get_number_of_features(1024, 899);
        assert_eq!(slots, 2640);
        let slots = config.get_number_of_features(1067, 1600);
        assert_eq!(slots, 2144);
    }
}


================================================
FILE: router/src/infer/chat_template.rs
================================================
use crate::infer::InferError;
use crate::{
    ChatTemplateInputs, Message, MessageBody, MessageChunk, TextMessage, TokenizerConfigToken, Tool,
};
use chrono::Local;
use minijinja::{Environment, ErrorKind, Template};
use minijinja_contrib::pycompat;

/// Raise a exception (custom function) used in the chat templates
pub(crate) fn raise_exception(err_text: String) -> Result<String, minijinja::Error> {
    Err(minijinja::Error::new(ErrorKind::SyntaxError, err_text))
}

/// Get the current date in a specific format (custom function), similar to `datetime.now().strftime()` in Python
pub(crate) fn strftime_now(format_str: String) -> Result<String, minijinja::Error> {
    Ok(Local::now().format(&format_str).to_string())
}

#[derive(Debug, Clone)]
pub(crate) struct ChatTemplate {
    template: Template<'static, 'static>,
    bos_token: Option<String>,
    eos_token: Option<String>,
    use_default_tool_template: bool,
}

impl ChatTemplate {
    pub(crate) fn new(
        template: String,
        bos_token: Option<TokenizerConfigToken>,
        eos_token: Option<TokenizerConfigToken>,
    ) -> Self {
        let mut env = Box::new(Environment::new());
        // enable things like .strip() or .capitalize()
        env.set_unknown_method_callback(pycompat::unknown_method_callback);

        // TODO: replace with better solution
        // hack to adjust gemma3 template for debug
        // replace 'messages[0]['content'][0]['text']' with 'messages[0]['content']'
        let mutated_template = template.replace(
            "messages[0]['content'][0]['text']",
            "messages[0]['content']",
        );
        //  Hack to fix Qwen3 templating.
        //  It uses python notation to reverse lists, which do not exist in minijinja
        //  so we're using the reverse filter instead.
        let mutated_template = mutated_template.replace("[::-1]", "|reverse");
        // TODO: replace with a better solution
        // Hack to remove the {% generation %} and {% endgeneration %} statements from
        // the Jinja2 chat templates if there, since those are only using for assistant
        // masking during training, and should be ignored during inference
        let mutated_template = mutated_template.replace("{% generation %}", "");
        let mutated_template = mutated_template.replace("{% endgeneration %}", "");

        let template_str = mutated_template.into_boxed_str();
        env.add_function("raise_exception", raise_exception);
        env.add_function("strftime_now", strftime_now);
        tracing::debug!("Loading template: {}", template_str);

        // leaking env and template_str as read-only, static resources for performance.
        let template = Box::leak(env)
            .template_from_str(Box::leak(template_str))
            .unwrap();

        // get the list of variables that are used in the template
        let variables = template.undeclared_variables(true);
        // check if the `tools` variable is used in the template
        let use_default_tool_template = !variables.contains("tools");
        tracing::debug!("Use default tool template: {}", use_default_tool_template);

        Self {
            template,
            bos_token: bos_token.map(|token| token.as_str().to_string()),
            eos_token: eos_token.map(|token| token.as_str().to_string()),
            use_default_tool_template,
        }
    }

    pub(crate) fn apply(
        &self,
        mut messages: Vec<Message>,
        tools_and_prompt: Option<(Vec<Tool>, String)>,
    ) -> Result<String, InferError> {
        let tools = match tools_and_prompt {
            Some((tools, tool_prompt)) => {
                // check if the `tools` variable is used in the template
                // if not, we need to append the tools to the last message
                let text = if self.use_default_tool_template {
                    match serde_json::to_string(&tools) {
                        Ok(tools_str) => format!("\n---\n{}\n{}", tools_str, tool_prompt),
                        Err(e) => return Err(InferError::ToolError(e.to_string())),
                    }
                } else {
                    // if the `tools` variable is used in the template, we just append the tool_prompt
                    format!("\n---\n{}", tool_prompt)
                };
                if let Some(last_message) = messages.last_mut() {
                    if let MessageBody::Content { content } = &mut last_message.body {
                        content.push(MessageChunk::Text { text });
                    }
                }
                Some(tools)
            }
            None => None,
        };

        let messages: Vec<TextMessage> = messages.into_iter().map(|c| c.into()).collect();
        let final_message = messages.last().cloned();
        let mut rendered_template = self
            .template
            .render(ChatTemplateInputs {
                messages,
                bos_token: self.bos_token.as_deref(),
                eos_token: self.eos_token.as_deref(),
                add_generation_prompt: true,
                tools,
            })
            .map_err(InferError::TemplateError)?;

        // if the last message is from the assistant, continue the generation prompt
        rendered_template = match final_message {
            Some(msg) if msg.role == "assistant" => {
                match rendered_template.rfind(msg.content.as_str()) {
                    // implementation based on feature in transformers pipeline
                    // https://github.com/huggingface/transformers/blob/1cf17077bf2d4affed31387c0943251a4ba8fab7/src/transformers/pipelines/text_generation.py#L418
                    Some(index) => rendered_template[..index + msg.content.len()]
                        .trim_end()
                        .to_string(),
                    None => rendered_template,
                }
            }
            _ => rendered_template,
        };

        Ok(rendered_template)
    }
}

// tests
#[cfg(test)]
mod tests {
    use crate::infer::chat_template::{raise_exception, strftime_now};
    use crate::infer::ChatTemplate;
    use crate::{
        ChatTemplateInputs, Message, MessageBody, MessageChunk, MessageContent, TextMessage,
        TokenizerConfigToken, Tool, Url,
    };
    use chrono::Local;
    use minijinja::Environment;

    #[test]
    fn test_chat_template() {
        let env = Environment::new();

        let source = r#"
        {% for message in messages %}
            {% if message['role'] == 'system' %}
                {% if message['content']%}
                    {{'### System:\n' + message['content']+'\n\n'}}
                {% endif %}
            {% elif message['role'] == 'user' %}
                {{'### User:\n' + message['content']+'\n\n'}}
            {% elif message['role'] == 'assistant' %}
                {{'### Assistant:\n'  + message['content']}}
            {% endif %}
            {% if loop.last and add_generation_prompt %}
                {{ '### Assistant:\n' }}
            {% endif %}
        {% endfor %}"#;

        // trim all the whitespace
        let source = source
            .lines()
            .map(|line| line.trim())
            .collect::<Vec<&str>>()
            .join("");

        let tmpl = env.template_from_str(&source);

        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
                TextMessage {
                    role: "user".to_string(),
                    content: "Hi!".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "Hello how can I help?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "user".to_string(),
                    content: "What is Deep Learning?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "magic!".to_string(),
                    ..Default::default()
                },
            ],
            bos_token: Some("[BOS]"),
            eos_token: Some("[EOS]"),
            add_generation_prompt: true,
            ..Default::default()
        };

        let result = tmpl.unwrap().render(chat_template_inputs).unwrap();

        assert_eq!(
            result,
            "### User:\nHi!\n\n### Assistant:\nHello how can I help?### User:\nWhat is Deep Learning?\n\n### Assistant:\nmagic!### Assistant:\n"
        );
    }

    #[test]
    fn test_chat_template_with_tool_response() {
        let env = Environment::new();

        // template modified from Llama-3.1-8B-Instruct
        // https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/blob/0e9e39f249a16976918f6564b8830bc894c89659/tokenizer_config.json#L2053
        // the main change is accesing `message.tool_call_id` from the messages
        let source = r#"
        {{- bos_token }}
        {%- if custom_tools is defined %}
            {%- set tools = custom_tools %}
        {%- endif %}
        {%- if not tools_in_user_message is defined %}
            {%- set tools_in_user_message = true %}
        {%- endif %}
        {%- if not date_string is defined %}
            {%- set date_string = "26 Jul 2024" %}
        {%- endif %}
        {%- if not tools is defined %}
            {%- set tools = none %}
        {%- endif %}

        {#- This block extracts the system message, so we can slot it into the right place. #}
        {%- if messages[0]['role'] == 'system' %}
            {%- set system_message = messages[0]['content']|trim %}
            {%- set messages = messages[1:] %}
        {%- else %}
            {%- set system_message = "" %}
        {%- endif %}

        {#- System message + builtin tools #}
        {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
        {%- if builtin_tools is defined or tools is not none %}
            {{- "Environment: ipython\n" }}
        {%- endif %}
        {%- if builtin_tools is defined %}
            {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
        {%- endif %}
        {{- "Cutting Knowledge Date: December 2023\n" }}
        {{- "Today Date: " + date_string + "\n\n" }}
        {%- if tools is not none and not tools_in_user_message %}
            {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
            {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
            {{- "Do not use variables.\n\n" }}
            {%- for t in tools %}
                {{- t | tojson(indent=4) }}
                {{- "\n\n" }}
            {%- endfor %}
        {%- endif %}
        {{- system_message }}
        {{- "<|eot_id|>" }}

        {#- Custom tools are passed in a user message with some extra guidance #}
        {%- if tools_in_user_message and not tools is none %}
            {#- Extract the first user message so we can plug it in here #}
            {%- if messages | length != 0 %}
                {%- set first_user_message = messages[0]['content']|trim %}
                {%- set messages = messages[1:] %}
            {%- else %}
                {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
        {%- endif %}
            {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
            {{- "Given the following functions, please respond with a JSON for a function call " }}
            {{- "with its proper arguments that best answers the given prompt.\n\n" }}
            {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
            {{- "Do not use variables.\n\n" }}
            {%- for t in tools %}
                {{- t | tojson(indent=4) }}
                {{- "\n\n" }}
            {%- endfor %}
            {{- first_user_message + "<|eot_id|>"}}
        {%- endif %}

        {%- for message in messages %}
            {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
                {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
            {%- elif 'tool_calls' in message %}
                {%- if not message.tool_calls|length == 1 %}
                    {{- raise_exception("This model only supports single tool-calls at once!") }}
                {%- endif %}
                {%- set tool_call = message.tool_calls[0].function %}
                {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
                    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
                    {{- "<|python_tag|>" + tool_call.name + ".call(" }}
                    {%- for arg_name, arg_val in tool_call.arguments | items %}
                        {{- arg_name + '="' + arg_val + '"' }}
                        {%- if not loop.last %}
                            {{- ", " }}
                        {%- endif %}
                        {%- endfor %}
                    {{- ")" }}
                {%- else  %}
                    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
                    {{- '{"name": "' + tool_call.name + '", ' }}
                    {{- '"parameters": ' }}
                    {{- tool_call.arguments | tojson }}
                    {{- "}" }}
                {%- endif %}
                {%- if builtin_tools is defined %}
                    {#- This means we're in ipython mode #}
                    {{- "<|eom_id|>" }}
                {%- else %}
                    {{- "<|eot_id|>" }}
                {%- endif %}
            {%- elif message.role == "tool" or message.role == "ipython" %}
                {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
                    {{- "TOOL CALL ID: " + message.tool_call_id + "\n\n" }}
                {%- if message.content is mapping or message.content is iterable %}
                    {{- message.content | tojson }}
                {%- else %}
                    {{- message.content }}
                {%- endif %}
                {{- "<|eot_id|>" }}
            {%- endif %}
        {%- endfor %}
        {%- if add_generation_prompt %}
            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
        {%- endif %}
        "#;

        // trim all the whitespace
        let source = source
            .lines()
            .map(|line| line.trim())
            .collect::<Vec<&str>>()
            .join("");

        let tmpl = env.template_from_str(&source);

        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
                TextMessage {
                    role: "user".to_string(),
                    content: "Hi!".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: r#"[ { "id": "0", "function": { "arguments": '{"longitude": 2.2945, "latitude": 48.8567}', "name": "get_weather", "description": None, }, "type": "function", } ]"#.to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "tool".to_string(),
                    content: "6.7".to_string(),
                    tool_call_id: Some("0".to_string()),
                },
            ],
            bos_token: Some("[BOS]"),
            eos_token: Some("[EOS]"),
            add_generation_prompt: true,
            ..Default::default()
        };

        let result = tmpl.unwrap().render(chat_template_inputs).unwrap();

        assert_eq!(
            result,
            r#"[BOS]<|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Hi!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

[ { "id": "0", "function": { "arguments": '{"longitude": 2.2945, "latitude": 48.8567}', "name": "get_weather", "description": None, }, "type": "function", } ]<|eot_id|><|start_header_id|>ipython<|end_header_id|>

TOOL CALL ID: 0

"6.7"<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"#
        );
    }

    #[test]
    fn test_chat_template_loop_controls() {
        // some chat templates as e.g. CohereForAI/c4ai-command-r7b-12-202 contain `break`
        // statements in their chat templates, so the feature `loop_controls` has been included
        // in `minijinja`
        let env = Environment::new();

        let source = r#"
        {% set user_count = 0 %}
        {% for message in messages %}
            {% if message['role'] == 'user' %}
                {{'### User:\n' + message['content']+'\n\n'}}
                {% set user_count = user_count + 1 %}
                {% if user_count >= 2 %}
                    {% break %}
                {% endif %}
            {% elif message['role'] == 'assistant' %}
                {{'### Assistant:\n'  + message['content']}}
            {% endif %}
        {% endfor %}
        {% if add_generation_prompt %}
            {{ '### Assistant:\n' }}
        {% endif %}"#;

        // trim all the whitespace
        let source = source
            .lines()
            .map(|line| line.trim())
            .collect::<Vec<&str>>()
            .join("");

        let tmpl = env.template_from_str(&source);

        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
                TextMessage {
                    role: "user".to_string(),
                    content: "Hi!".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "Hello how can I help?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "user".to_string(),
                    content: "What is Deep Learning?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "magic!".to_string(),
                    ..Default::default()
                },
            ],
            bos_token: Some("[BOS]"),
            eos_token: Some("[EOS]"),
            add_generation_prompt: true,
            ..Default::default()
        };

        let result = tmpl.unwrap().render(chat_template_inputs).unwrap();

        assert_eq!(
            result,
            "### User:\nHi!\n\n### Assistant:\nHello how can I help?### User:\nWhat is Deep Learning?\n\n### Assistant:\n"
        );
    }

    #[test]
    fn test_chat_template_invalid_with_raise() {
        let mut env = Environment::new();
        env.add_function("raise_exception", raise_exception);
        env.add_function("strftime_now", strftime_now);

        let source = r#"
        {{ bos_token }}
        {% for message in messages %}
        {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
        {% endif %}
        {% if message['role'] == 'user' %}
        {{ '[INST] ' + message['content'] + ' [/INST]' }}
        {% elif message['role'] == 'assistant' %}
        {{ message['content'] + eos_token}}
        {% else %}
        {{ raise_exception('Only user and assistant roles are supported!') }}
        {% endif %}
        {% endfor %}"#;

        // trim all the whitespace
        let source = source
            .lines()
            .map(|line| line.trim())
            .collect::<Vec<&str>>()
            .join("");

        let tmpl = env.template_from_str(&source);

        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
                TextMessage {
                    role: "user".to_string(),
                    content: "Hi!".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "user".to_string(),
                    content: "Hi again!".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "Hello how can I help?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "user".to_string(),
                    content: "What is Deep Learning?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "magic!".to_string(),
                    ..Default::default()
                },
            ],
            bos_token: Some("[BOS]"),
            eos_token: Some("[EOS]"),
            add_generation_prompt: true,
            ..Default::default()
        };

        let result = tmpl.unwrap().render(chat_template_inputs); //.err().unwrap();

        match result {
            Ok(_) => panic!("Should have failed"),
            Err(e) => {
                assert_eq!(
                    e.detail().unwrap(),
                    "Conversation roles must alternate user/assistant/user/assistant/..."
                );
            }
        }
    }

    #[test]
    fn test_chat_template_valid_with_raise() {
        let mut env = Environment::new();
        env.add_function("raise_exception", raise_exception);
        env.add_function("strftime_now", strftime_now);

        let source = r#"
        {{ bos_token }}
        {% for message in messages %}
        {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
        {% endif %}
        {% if message['role'] == 'user' %}
        {{ '[INST] ' + message['content'] + ' [/INST]' }}
        {% elif message['role'] == 'assistant' %}
        {{ message['content'] + eos_token}}
        {% else %}
        {{ raise_exception('Only user and assistant roles are supported!') }}
        {% endif %}
        {% endfor %}"#;

        // trim all the whitespace
        let source = source
            .lines()
            .map(|line| line.trim())
            .collect::<Vec<&str>>()
            .join("");

        let tmpl = env.template_from_str(&source);

        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
                TextMessage {
                    role: "user".to_string(),
                    content: "Hi!".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "Hello how can I help?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "user".to_string(),
                    content: "What is Deep Learning?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "magic!".to_string(),
                    ..Default::default()
                },
            ],
            bos_token: Some("[BOS]"),
            eos_token: Some("[EOS]"),
            add_generation_prompt: true,
            ..Default::default()
        };

        let result = tmpl.unwrap().render(chat_template_inputs).unwrap();
        assert_eq!(result, "[BOS][INST] Hi! [/INST]Hello how can I help?[EOS][INST] What is Deep Learning? [/INST]magic![EOS]");
    }

    #[test]
    fn test_chat_template_valid_with_strftime_now() {
        let mut env = Environment::new();
        env.add_function("raise_exception", raise_exception);
        env.add_function("strftime_now", strftime_now);

        let source = r#"
        {% set today = strftime_now("%Y-%m-%d") %}
        {% set default_system_message = "The current date is " + today + "." %}
        {{ bos_token }}
        {% if messages[0]['role'] == 'system' %}
            { set system_message = messages[0]['content'] %}
            {%- set loop_messages = messages[1:] %}
        {% else %}
            {%- set system_message = default_system_message %}
            {%- set loop_messages = messages %}
        {% endif %}
        {{ '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}
        {% for message in loop_messages %}
            {% if message['role'] == 'user' %}
                {{ '[INST]' + message['content'] + '[/INST]' }}
            {% elif message['role'] == 'assistant' %}
                {{ message['content'] + eos_token }}
            {% else %}
                {{ raise_exception('Only user and assistant roles are supported!') }}
            {% endif %}
        {% endfor %}
        "#;

        // trim all the whitespace
        let source = source
            .lines()
            .map(|line| line.trim())
            .collect::<Vec<&str>>()
            .join("");

        let tmpl = env.template_from_str(&source);

        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
                TextMessage {
                    role: "user".to_string(),
                    content: "Hi!".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "Hello how can I help?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "user".to_string(),
                    content: "What is Deep Learning?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "magic!".to_string(),
                    ..Default::default()
                },
            ],
            bos_token: Some("[BOS]"),
            eos_token: Some("[EOS]"),
            add_generation_prompt: true,
            ..Default::default()
        };

        let current_date = Local::now().format("%Y-%m-%d").to_string();
        let result = tmpl.unwrap().render(chat_template_inputs).unwrap();
        assert_eq!(result, format!("[BOS][SYSTEM_PROMPT]The current date is {}.[/SYSTEM_PROMPT][INST]Hi![/INST]Hello how can I help?[EOS][INST]What is Deep Learning?[/INST]magic![EOS]", current_date));
    }

    #[test]
    fn test_chat_template_valid_with_add_generation_prompt() {
        let mut env = Environment::new();
        env.add_function("raise_exception", raise_exception);
        env.add_function("strftime_now", strftime_now);

        let source = r#"
        {% for message in messages %}
        {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
        {% endfor %}
        {% if add_generation_prompt %}
            {{ '<|im_start|>assistant\n' }}
        {% endif %}"#;

        // trim all the whitespace
        let source = source
            .lines()
            .map(|line| line.trim())
            .collect::<Vec<&str>>()
            .join("");

        let tmpl = env.template_from_str(&source);

        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
                TextMessage {
                    role: "user".to_string(),
                    content: "Hi!".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "Hello how can I help?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "user".to_string(),
                    content: "What is Deep Learning?".to_string(),
                    ..Default::default()
                },
                TextMessage {
                    role: "assistant".to_string(),
                    content: "magic!".to_string(),
                    ..Default::default()
                },
            ],
            bos_token: Some("[BOS]"),
            eos_token: Some("[EOS]"),
            add_generation_prompt: true,
            ..Default::default()
        };

        let result = tmpl.unwrap().render(chat_template_inputs).unwrap();
        assert_eq!(result, "<|im_start|>user\nHi!<|im_end|>\n<|im_start|>assistant\nHello how can I help?<|im_end|>\n<|im_start|>user\nWhat is Deep Learning?<|im_end|>\n<|im_start|>assistant\nmagic!<|im_end|>\n<|im_start|>assistant\n");
    }

    struct ChatTemplateTestItem {
        name: &'static str,
        chat_template: &'static str,
        input: ChatTemplateInputs<'static>,
        target: &'static str,
    }

    #[test]
    fn test_many_chat_templates() {
        let example_chat = vec![
            TextMessage {
                role: "user".to_string(),
                content: "Hello, how are you?".to_string(),
                ..Default::default()
            },
            TextMessage {
                role: "assistant".to_string(),
                content: "I'm doing great. How can I help you today?".to_string(),
                ..Default::default()
            },
            TextMessage {
                role: "user".to_string(),
                content: "I'd like to show off how chat templating works!".to_string(),
                ..Default::default()
            },
        ];

        let example_chat_with_system = [TextMessage {
            role: "system".to_string(),
            content: "You are a friendly chatbot who always responds in the style of a pirate"
                .to_string(),
            ..Default::default()
        }]
        .iter()
        .chain(&example_chat)
        .cloned()
        .collect::<Vec<_>>();

        let test_default_templates = vec![
            ChatTemplateTestItem {
                name: "_base",
                chat_template: "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some(""),
                    eos_token: Some(""),
                    ..Default::default()
                },
                target: "<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
            },
            ChatTemplateTestItem {
                name: "blenderbot",
                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some(""),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: " Hello, how are you?  I'm doing great. How can I help you today?   I'd like to show off how chat templating works!</s>",
            },
            ChatTemplateTestItem {
                name: "blenderbot_small",
                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some(""),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: " Hello, how are you?  I'm doing great. How can I help you today?   I'd like to show off how chat templating works!</s>",
            },
            ChatTemplateTestItem {
                name: "bloom",
                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some(""),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "Hello, how are you?</s>I'm doing great. How can I help you today?</s>I'd like to show off how chat templating works!</s>",
            },
            ChatTemplateTestItem {
                name: "gpt_neox",
                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some(""),
                    eos_token: Some("<|endoftext|>"),
                    ..Default::default()
                },
                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
            },
            ChatTemplateTestItem {
                name: "gpt2",
                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some(""),
                    eos_token: Some("<|endoftext|>"),
                    ..Default::default()
                },
                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
            },
            ChatTemplateTestItem {
                name: "llama",
                // NOTE: the `.strip()` has been replaced with `| trim` in the following template
                chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token +'[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content | trim + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat_with_system.clone(),
                    add_generation_prompt: true,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<s>[INST] <<SYS>>\nYou are a friendly chatbot who always responds in the style of a pirate\n<</SYS>>\n\nHello, how are you? [/INST] I'm doing great. How can I help you today? </s><s>[INST] I'd like to show off how chat templating works! [/INST]",
            },
            ChatTemplateTestItem {
                name: "whisper",
                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: true,
                    bos_token: Some(""),
                    eos_token: Some("<|endoftext|>"),
                    ..Default::default()
                },
                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
            },
        ];

        #[allow(unused_variables)] // name is unused
        for ChatTemplateTestItem {
            name,
            chat_template,
            input,
            target,
        } in test_default_templates
        {
            let mut env = Environment::new();
            env.add_function("raise_exception", raise_exception);
            env.add_function("strftime_now", strftime_now);
            let tmpl = env.template_from_str(chat_template);
            let result = tmpl.unwrap().render(input).unwrap();
            assert_eq!(result, target);
        }

        let test_custom_templates = vec![
            ChatTemplateTestItem {
                name: "HuggingFaceH4/zephyr-7b-beta (add_generation_prompt=false)",
                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat_with_system.clone(),
                    add_generation_prompt: false,
                    bos_token: Some(""),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate</s><|user|>\nHello, how are you?</s><|assistant|>\nI'm doing great. How can I help you today?</s><|user|>\nI'd like to show off how chat templating works!</s>",
            },
            ChatTemplateTestItem {
                name: "HuggingFaceH4/zephyr-7b-beta (add_generation_prompt=true)",
                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
                input: ChatTemplateInputs {
                    messages: vec![
                        TextMessage {
                            role: "system".to_string(),
                            content: "You are a friendly chatbot who always responds in the style of a pirate".to_string(),
                            ..Default::default()
                        },
                        TextMessage {
                            role: "user".to_string(),
                            content: "How many helicopters can a human eat in one sitting?".to_string(),
                            ..Default::default()
                        },
                    ],
                    add_generation_prompt: true,
                    bos_token: Some(""),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate</s><|user|>\nHow many helicopters can a human eat in one sitting?</s><|assistant|>",
            },
            ChatTemplateTestItem {
                name: "HuggingFaceH4/zephyr-7b-gemma-v0.1",
                chat_template: "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<bos>"),
                    eos_token: Some("<eos>"),
                    ..Default::default()
                },
                target: "<bos><|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
            },
            ChatTemplateTestItem {
                name: "mistralai/Mistral-7B-Instruct-v0.1",
                chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]",
            },
            ChatTemplateTestItem {
                name: "mistralai/Mixtral-8x7B-Instruct-v0.1",
                chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s>[INST] I'd like to show off how chat templating works! [/INST]",
            },
            ChatTemplateTestItem {
                name: "cognitivecomputations/dolphin-2.5-mixtral-8x7b",
                chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
            },
            ChatTemplateTestItem {
                name: "openchat/openchat-3.5-0106",
                // `.title()` has been replaced with `| upper` in the following template
                chat_template: "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + (message['role'] | title) + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<s>GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>",
            },
            ChatTemplateTestItem {
                name: "upstage/SOLAR-10.7B-Instruct-v1.0",
                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "Hello, how are you?</s>I'm doing great. How can I help you today?</s>I'd like to show off how chat templating works!</s>",
            },
            ChatTemplateTestItem {
                name: "codellama/CodeLlama-70b-Instruct-hf",
                // NOTE: `.strip()` has been replaced with `| trim` in the following template
                chat_template: "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\\n\\n ' + message['content'] | trim %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\\nDestination: user\\n\\n '}}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<s>Source: user\n\n Hello, how are you? <step> Source: assistant\n\n I'm doing great. How can I help you today? <step> Source: user\n\n I'd like to show off how chat templating works! <step> Source: assistant\nDestination: user\n\n ",
            },
            ChatTemplateTestItem {
                name: "Deci/DeciLM-7B-instruct",
                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\\n'  + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "### User:\nHello, how are you?### Assistant:\nI'm doing great. How can I help you today?### User:\nI'd like to show off how chat templating works!",
            },
            ChatTemplateTestItem {
                name: "Qwen/Qwen1.5-72B-Chat",
                chat_template: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!",
            },
            ChatTemplateTestItem {
                name: "deepseek-ai/deepseek-llm-7b-chat",
                chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\\n\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<｜begin▁of▁sentence｜>"),
                    eos_token: Some("<｜end▁of▁sentence｜>"),
                    ..Default::default()
                },
                target: "<｜begin▁of▁sentence｜>User: Hello, how are you?\n\nAssistant: I'm doing great. How can I help you today?<｜end▁of▁sentence｜>User: I'd like to show off how chat templating works!\n\n",
            },
            ChatTemplateTestItem {
                name: "h2oai/h2o-danube-1.8b-chat",
                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<|prompt|>Hello, how are you?</s><|answer|>I'm doing great. How can I help you today?</s><|prompt|>I'd like to show off how chat templating works!</s>",
            },
            ChatTemplateTestItem {
                name: "internlm/internlm2-chat-7b",
                chat_template: "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "<s><|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
            },
            ChatTemplateTestItem {
                name: "TheBloke/deepseek-coder-33B-instruct-AWQ",
                chat_template: "{%- set found_item = false -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set found_item = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<｜begin▁of▁sentence｜>"),
                    eos_token: Some("<|EOT|>"),
                    ..Default::default()
                },
                target: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n### Instruction:\nHello, how are you?\n### Response:\nI'm doing great. How can I help you today?\n<|EOT|>\n### Instruction:\nI'd like to show off how chat templating works!\n### Response:\n",
            },
            ChatTemplateTestItem {
                name: "ericzzz/falcon-rw-1b-chat",
                // `.strip()` has been replaced with `| trim` in the following template
                chat_template: "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'] | trim }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] | trim }}{% elif message['role'] == 'assistant' %}{{ '[RESP] '  + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<|endoftext|>"),
                    eos_token: Some("<|endoftext|>"),
                    ..Default::default()
                },
                target: "[INST] Hello, how are you? [RESP] I'm doing great. How can I help you today?<|endoftext|>[INST] I'd like to show off how chat templating works!",
            },
            ChatTemplateTestItem {
                name: "abacusai/Smaug-34B-v0.1",
                chat_template: "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "Hello, how are you? [/INST] I'm doing great. How can I help you today? </s><s>[INST] I'd like to show off how chat templating works! [/INST]",
            },
            ChatTemplateTestItem {
                name: "maywell/Synatra-Mixtral-8x7B",
                chat_template: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "Below is an instruction that describes a task. Write a response that appropriately completes the request.### Instruction:Hello, how are you?### Response:I'm doing great. How can I help you today?### Instruction:I'd like to show off how chat templating works!",
            },
            ChatTemplateTestItem {
                name: "deepseek-ai/deepseek-coder-33b-instruct",
                chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<｜begin▁of▁sentence｜>"),
                    eos_token: Some("</EOT>"),
                    ..Default::default()
                },
                target: "<｜begin▁of▁sentence｜>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\nHello, how are you?\n### Response:\nI'm doing great. How can I help you today?\n<|EOT|>\n### Instruction:\nI'd like to show off how chat templating works!\n",
            },
            // NOT INCLUDED
            // - meetkai/functionary-medium-v3.2
            // - fireworks-ai/firefunction-v1
            // https://github
            ChatTemplateTestItem {
                name: "maywell/PiVoT-MoE",
                chat_template: "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content']|trim }}{% elif message['role'] == 'user' %}### Instruction: {{ message['content']|trim }}{% elif message['role'] == 'assistant' %}### Response: {{ message['content']|trim }}{% elif message['role'] == 'user_context' %}### Input: {{ message['content']|trim }}{% endif %}{% if not loop.last %}\n{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}### Response:{% endif %}",
                input: ChatTemplateInputs {
                    messages: example_chat_with_system.clone(),
                    add_generation_prompt: false,
                    bos_token: Some("<s>"),
                    eos_token: Some("</s>"),
                    ..Default::default()
                },
                target: "You are a friendly chatbot who always responds in the style of a pirateYou are a friendly chatbot who always responds in the style of a pirate### Instruction: Hello, how are you?### Response: I'm doing great. How can I help you today?### Instruction: I'd like to show off how chat templating works!",
            },
        ];

        #[allow(unused_variables)] // name is unused
        for ChatTemplateTestItem {
            name,
            chat_template,
            input,
            target,
        } in test_custom_templates
        {
            let mut env = Environment::new();
            env.add_function("raise_exception", raise_exception);
            env.add_function("strftime_now", strftime_now);
            // trim all the whitespace
            let chat_template = chat_template
                .lines()
                .map(|line| line.trim())
                .collect::<Vec<&str>>()
                .join("");

            let tmpl = env.template_from_str(&chat_template);
            let result = tmpl.unwrap().render(input).unwrap();
            assert_eq!(result, target);
        }
    }

    #[test]
    fn test_chat_template_with_default_tool_template() {
        let ct = ChatTemplate::new(
            "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}".to_string(),
            Some(TokenizerConfigToken::String("<s>".to_string())),
            Some(TokenizerConfigToken::String("</s>".to_string())),
        );

        // convert TextMessage to Message
        let msgs: Vec<Message> = vec![
            Message {
                name: None,
                role: "user".to_string(),
                body: MessageBody::Content {
                    content: MessageContent::SingleText(
                        "I'd like to show off how chat templating works!".to_string(),
                    ),
                },
            },
            Message {
                name: None,
                role: "assistant".to_string(),
                body: MessageBody::Content {
                    content: MessageContent::SingleText(
                        "Great! How can I help you today?".to_string(),
                    ),
                },
            },
            Message {
                name: None,
                role: "user".to_string(),
                body: MessageBody::Content {
                    content: MessageContent::SingleText("Just testing".to_string()),
                },
            },
        ];
        let tools_string = r#"[{"type": "function","function": {"name": "get_current_weather","description": "Get the current weather","parameters": {"type": "object","properties": {"location": {"type": "string","description": "The city and state, e.g. San Francisco, CA"},"format": {"type": "string","enum": ["celsius", "fahrenheit"],"description": "The temperature unit to use. Infer this from the users location."}},"required": ["location", "format"]}}}]"#.to_string();
        let tools: Vec<Tool> = serde_json::from_str(&tools_string).unwrap();
        let tool_prompt = "This default prompt will be used".to_string();
        let tools_and_prompt = Some((tools, tool_prompt));
        let result = ct.apply(msgs, tools_and_prompt);
        let expected = "<s>[INST] I'd like to show off how chat templating works! [/INST]Great! How can I help you today?</s> [INST] Just testing\n---\n[{\"type\":\"function\",\"function\":{\"description\":\"Get the current weather\",\"name\":\"get_current_weather\",\"arguments\":\"{\\\"type\\\":\\\"object\\\",\\\"properties\\\":{\\\"location\\\":{\\\"type\\\":\\\"string\\\",\\\"description\\\":\\\"The city and state, e.g. San Francisco, CA\\\"},\\\"format\\\":{\\\"type\\\":\\\"string\\\",\\\"enum\\\":[\\\"celsius\\\",\\\"fahrenheit\\\"],\\\"description\\\":\\\"The temperature unit to use. Infer this from the users location.\\\"}},\\\"required\\\":[\\\"location\\\",\\\"format\\\"]}\"}}]\nThis default prompt will be used [/INST]".to_string();
        assert_eq!(result.unwrap(), expected);
    }

    #[test]
    fn test_chat_template_with_custom_tool_template() {
        // chat template from meta-llama/Meta-Llama-3.1-8B-Instruct
        let ct = ChatTemplate::new(
            "{{- bos_token }}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n            {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n".to_string(),
            Some(TokenizerConfigToken::String("<s>".to_string())),
            Some(TokenizerConfigToken::String("</s>".to_string())),
        );
        let msgs: Vec<Message> = vec![
            Message {
                name: None,
                role: "system".to_string(),
                body: MessageBody::Content {
                    content: MessageContent::SingleText(
                        "Youre a helpful assistant! Answer the users question best you can."
                            .to_string(),
                    ),
                },
            },
            Message {
                name: None,
                role: "user".to_string(),
                body: MessageBody::Content {
                    content: MessageContent::SingleText(
                        "What is the weather like in Brooklyn, New York?".to_string(),
                    ),
                },
            },
        ];
        let tools_string = r#"[{"type": "function","function": {"name": "get_current_weather","description": "Get the current weather","parameters": {"type": "object","properties": {"location": {"type": "string","description": "The city and state, e.g. San Francisco, CA"},"format": {"type": "string","enum": ["celsius", "fahrenheit"],"description": "The temperature unit to use. Infer this from the users location."}},"required": ["location", "format"]}}}]"#.to_string();
        let tools: Vec<Tool> = serde_json::from_str(&tools_string).unwrap();
        let tool_prompt = "This default prompt will be used".to_string();
        let tools_and_prompt = Some((tools, tool_prompt));
        let result = ct.apply(msgs, tools_and_prompt);
        let expected = "<s><|start_header_id|>system<|end_header_id|>\n\nEnvironment: ipython\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYoure a helpful assistant! Answer the users question best you can.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nGiven the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.\n\nRespond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.Do not use variables.\n\n{\n    \"function\": {\n        \"arguments\": \"{\\\"type\\\":\\\"object\\\",\\\"properties\\\":{\\\"location\\\":{\\\"type\\\":\\\"string\\\",\\\"description\\\":\\\"The city and state, e.g. San Francisco, CA\\\"},\\\"format\\\":{\\\"type\\\":\\\"string\\\",\\\"enum\\\":[\\\"celsius\\\",\\\"fahrenheit\\\"],\\\"description\\\":\\\"The temperature unit to use. Infer this from the users location.\\\"}},\\\"required\\\":[\\\"location\\\",\\\"format\\\"]}\",\n        \"description\": \"Get the current weather\",\n        \"name\": \"get_current_weather\"\n    },\n    \"type\": \"function\"\n}\n\nWhat is the weather like in Brooklyn, New York?\n---\nThis default prompt will be used<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".to_string();
        assert_eq!(result.unwrap(), expected);
    }

    #[test]
    fn test_chat_template_with_special_system_prompt() {
        // chat template from gemma3
        let ct = ChatTemplate::new(
            r#"{{ bos_token }}
{%- if messages[0]['role'] == 'system' -%}
    {%- set first_user_prefix = messages[0]['content'][0]['text'] + '

' -%}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {%- set first_user_prefix = "" -%}
    {%- set loop_messages = messages -%}
{%- endif -%}
{%- for message in loop_messages -%}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
    {%- endif -%}
    {%- if (message['role'] == 'assistant') -%}
        {%- set role = "model" -%}
    {%- else -%}
        {%- set role = message['role'] -%}
    {%- endif -%}
    {{ '<start_of_turn>' + role + '
' + (first_user_prefix if loop.first else "") }}
    {%- if message['content'] is string -%}
        {{ message['content'] | trim }}
    {%- elif message['content'] is iterable -%}
        {%- for item in message['content'] -%}
            {%- if item['type'] == 'image' -%}
                {{ '<start_of_image>' }}
            {%- elif item['type'] == 'text' -%}
                {{ item['text'] | trim }}
            {%- endif -%}
        {%- endfor -%}
    {%- else -%}
        {{ raise_exception("Invalid content type") }}
    {%- endif -%}
    {{ '<end_of_turn>
' }}
{%- endfor -%}
{%- if add_generation_prompt -%}
    {{'<start_of_turn>model
'}}
{%- endif -%}
"#
            .to_string(),
            Some(TokenizerConfigToken::String("<bos>".to_string())),
            Some(TokenizerConfigToken::String("</eos>".to_string())),
        );
        let msgs: Vec<Message> = vec![
            Message {
                name: None,
                role: "system".to_string(),
                body: MessageBody::Content {
                    content: MessageContent::MultipleChunks(vec![MessageChunk::Text {
                        text: "You are a helpful assistant.".to_string(),
                    }]),
                },
            },
            Message {
                name: None,
                role: "user".to_string(),
                body: MessageBody::Content {
                    content: MessageContent::MultipleChunks(vec![
                        MessageChunk::Text {
                            text: "I'm already using this supplement ".to_string(),
                        },
                        MessageChunk::ImageUrl {
                            image_url: Url {
                                url:  "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3018.JPG".to_string()
                            },
                        },
                        MessageChunk::Text {
                            text: "and I want to use this one too ".to_string()
                        },
                        MessageChunk::ImageUrl {
                            image_url: Url {
                                url: "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3015.jpg".to_string()
                            },
                        },
                        MessageChunk::Text {
                            text: " what are cautions?".to_string()
                        },
                    ]),
                },
            },
        ];

        let result = ct.apply(msgs, None);
        let expected = "<bos><start_of_turn>user\nYou are a helpful assistant.\n\nI'm already using this supplement ![](https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3018.JPG)and I want to use this one too ![](https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_3015.jpg) what are cautions?<end_of_turn>\n<start_of_turn>model\n".to_string();
        assert_eq!(result.unwrap(), expected);
    }
}


================================================
FILE: router/src/infer/mod.rs
================================================
// pub(crate) mod v2;
mod chat_template;
pub mod tool_grammar;

use crate::validation::{ValidGenerateRequest, Validation, ValidationError};
use crate::Tool;
use crate::{
    ChatTemplateVersions, FinishReason, GenerateRequest, HubProcessorConfig, HubTokenizerConfig,
    Message, PrefillToken, Token,
};
use async_stream::stream;
use async_trait::async_trait;
use axum::response::sse::Event;
use chat_template::ChatTemplate;
use futures::future::try_join_all;
use futures::Stream;
use minijinja::ErrorKind;
use serde::Serialize;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use thiserror::Error;
use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tokio_stream::StreamExt;
use tracing::instrument;

#[async_trait]
pub trait Backend {
    fn schedule(
        &self,
        request: ValidGenerateRequest,
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError>;

    async fn health(&self, current_health: bool) -> bool;

    /// The state of the health on startup
    /// Typically false, or true if the backend includes
    /// a warmup phase.
    fn start_health(&self) -> bool {
        false
    }

    fn name(&self) -> &'static str;
}

/// Inference struct
#[derive(Clone)]
pub struct Infer {
    /// Validation
    validation: Validation,
    /// Request backend
    backend: Arc<dyn Backend + Send + Sync>,
    /// Chat template
    pub(crate) chat_template: Option<ChatTemplate>,
    /// Inference limit
    limit_concurrent_requests: Arc<Semaphore>,
    /// Backend health
    backend_health: Arc<AtomicBool>,
}

impl Infer {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        backend: impl Backend + Send + Sync + 'static,
        validation: Validation,
        max_concurrent_requests: usize,
        tokenizer_config: HubTokenizerConfig,
        processor_config: HubProcessorConfig,
    ) -> Self {
        let chat_template = tokenizer_config
            .chat_template
            .or(processor_config.chat_template)
            .and_then(|t| match t {
                ChatTemplateVersions::Single(template) => Some(template),
                ChatTemplateVersions::Multiple(templates) => templates
                    .into_iter()
                    .find(|t| t.name == "default")
                    .map(|t| t.template),
            })
            .map(|t| ChatTemplate::new(t, tokenizer_config.bos_token, tokenizer_config.eos_token));

        // Inference limit with a semaphore
        let semaphore = Arc::new(Semaphore::new(max_concurrent_requests));

        // Backend health
        let backend_health = Arc::new(AtomicBool::new(backend.start_health()));

        Self {
            validation,
            backend: Arc::new(backend),
            chat_template,
            limit_concurrent_requests: semaphore,
            backend_health,
        }
    }

    /// Add a new request to the queue and return a stream of InferStreamResponse
    #[instrument(skip_all)]
    pub(crate) async fn generate_stream<'a>(
        &'a self,
        request: GenerateRequest,
    ) -> Result<
        (
            OwnedSemaphorePermit,
            u32, // input_length
            impl Stream<Item = Result<InferStreamResponse, InferError>> + 'a,
        ),
        InferError,
    > {
        // Limit concurrent requests by acquiring a permit from the semaphore
        let permit = self
            .clone()
            .limit_concurrent_requests
            .try_acquire_owned()
            .map_err(|err| {
                metrics::counter!("tgi_request_failure", "err" => "overloaded").increment(1);
                tracing::error!("{err}");
                err
            })?;

        // Validate request
        let mut local_request = request.clone();
        let valid_request = self.validation.validate(request).await.map_err(|err| {
            metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
            tracing::error!("{err}");
            err
        })?;

        let seed = valid_request.parameters.seed;
        local_request.parameters.seed = Some(seed);
        let input_length = valid_request.input_length;
        let max_total_new_tokens = valid_request.stopping_parameters.max_total_new_tokens;
        let mut generation_stream = self.backend.schedule(valid_request)?;

        // Wrap generation stream to update the backend health if the stream contains an error
        let final_stream = stream! {
            let mut total_generated_tokens = 0;
            let mut first_start = None;
            let mut first_queued = None;
            let mut all_generated_text: Option<GeneratedText> = None;

            while let Some(response) = generation_stream.next().await {
                let response = response.inspect_err(|_err| {
                    self.backend_health.store(false, Ordering::SeqCst);
                })?;

                match response {
                    InferStreamResponse::Prefill(_) => yield Ok(response),
                    InferStreamResponse::Intermediate { .. } => {
                        total_generated_tokens += 1;
                        yield Ok(response);
                    }
                    InferStreamResponse::End { token, top_tokens,generated_text, start, queued  } => {
                        total_generated_tokens += 1;
                        first_start = first_start.or(Some(start));
                        first_queued = first_queued.or(Some(queued));
                        if let Some(v) = all_generated_text.as_mut() {
                                v.text.push_str(&generated_text.text);
                                v.generated_tokens = total_generated_tokens;
                                v.finish_reason = generated_text.finish_reason.clone();
                        };

                        if matches!(generated_text.finish_reason, FinishReason::Length) && total_generated_tokens < max_total_new_tokens {
                            local_request.inputs.push_str(&generated_text.text);
                            all_generated_text = all_generated_text.or(Some(generated_text));

                            let valid_request = match self.validation.validate(local_request.clone()).await {
                                Ok(valid_request) => valid_request,
                                Err(err) => {
                                    tracing::debug!("Failed to continue request: {err}");
                                    yield Ok(InferStreamResponse::End {token, top_tokens, generated_text: all_generated_text.unwrap(), start: first_start.unwrap(), queued: first_queued.unwrap() });
                                    break;
                                }
                            };

                            generation_stream = match self.backend.schedule(valid_request) {
                                Ok(stream) => {
                                    tracing::debug!("Continue request");
                                    yield Ok(InferStreamResponse::Intermediate { token, top_tokens } );
                                    stream
                                },
                                Err(err) => {
                                    tracing::debug!("Failed to continue request: {err}");
                                    yield Ok(InferStreamResponse::End {token, top_tokens, generated_text: all_generated_text.unwrap(), start: first_start.unwrap(), queued: first_queued.unwrap() });
                                    break;
                                }
                            }
                        } else {
                            yield Ok(InferStreamResponse::End {token, top_tokens, generated_text: all_generated_text.unwrap_or(generated_text), start: first_start.unwrap(), queued: first_queued.unwrap() });
                            break;
                        }

                    }
                }
            }
        };

        Ok((permit, input_length, final_stream))
    }

    /// Tokenizer the input
    #[instrument(skip_all)]
    pub(crate) async fn tokenize(
        &self,
        request: GenerateRequest,
    ) -> Result<tokenizers::Encoding, InferError> {
        // Tokenize request
        let inputs = request.inputs;
        let add_special_tokens = request.add_special_tokens;
        let truncate = request.parameters.truncate;
        let encoding = self
            .validation
            .tokenize(inputs, add_special_tokens, truncate)
            .await
            .map_err(|err| {
                tracing::error!("Tokenization {err}");
                err
            })?;

        // Return Encoding
        Ok(encoding.0)
    }

    /// Apply the chat template to the chat request
    #[instrument(skip_all)]
    pub(crate) fn apply_chat_template(
        &self,
        messages: Vec<Message>,
        tools_and_prompt: Option<(Vec<Tool>, String)>,
    ) -> Result<String, InferError> {
        self.chat_template
            .as_ref()
            .ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
            .apply(messages, tools_and_prompt)
            .map_err(|e| {
                metrics::counter!("tgi_request_failure", "err" => "template").increment(1);
                tracing::error!("{e}");
                e
            })
    }

    /// Add a new request to the queue and return a InferResponse
    #[instrument(skip_all)]
    pub(crate) async fn generate(
        &self,
        request: GenerateRequest,
    ) -> Result<InferResponse, InferError> {
        let use_top_tokens = request.parameters.top_n_tokens.is_some_and(|x| x > 0);

        // Create stream and keep semaphore permit as long as generate lives
        let (_permit, _input_length, stream) = self.generate_stream(request).await?;

        // Return values
        let mut result_prefill = Vec::new();
        let mut result_tokens = Vec::new();
        let mut result_top_tokens = Vec::new();
        let mut result_generated_text = None;
        let mut result_start = None;
        let mut result_queued = None;

        let mut stream = Box::pin(stream);

        // Iterate on stream
        while let Some(response) = stream.next().await {
            match response? {
                // Add prefill tokens
                InferStreamResponse::Prefill(prefill_tokens) => {
                    result_prefill = prefill_tokens;
                }
                // Push last token
                InferStreamResponse::Intermediate { token, top_tokens } => {
                    result_tokens.push(token);
                    result_top_tokens.push(top_tokens);
                }
                // Final message
                // Set return values
                InferStreamResponse::End {
                    token,
                    generated_text,
                    start,
                    queued,
                    top_tokens,
                } => {
                    result_tokens.push(token);
                    result_top_tokens.push(top_tokens);
                    result_generated_text = Some(generated_text);
                    result_start = Some(start);
                    result_queued = Some(queued)
                }
            }
        }

        // Check that we received a `InferStreamResponse::End` message
        if let (Some(generated_text), Some(queued), Some(start)) =
            (result_generated_text, result_queued, result_start)
        {
            Ok(InferResponse {
                prefill: result_prefill,
                _input_length,
                tokens: result_tokens,
                generated_text,
                queued,
                start,
                top_tokens: if use_top_tokens {
                    result_top_tokens
                } else {
                    Vec::new()
                },
            })
        } else {
            let err = InferError::IncompleteGeneration;
            metrics::counter!("tgi_request_failure", "err" => "incomplete").increment(1);
            tracing::error!("{err}");
            Err(err)
        }
    }
    /// Add best_of new requests to the queue and return a InferResponse of the sequence with
    /// the highest log probability per token
    #[instrument(skip(self, request))]
    pub(crate) async fn generate_best_of(
        &self,
        request: GenerateRequest,
        best_of: usize,
    ) -> Result<(InferResponse, Vec<InferResponse>), InferError> {
        // validate  best_of parameter separately
        let best_of = self.validation.validate_best_of(best_of)?;

        // create multiple generate requests
        let mut infer_responses: Vec<InferResponse> =
            try_join_all((0..best_of).map(|_| self.generate(request.clone()))).await?;

        // get the sequence with the highest log probability per token
        let mut max_index = 0;
        let mut max_logprob: f32 = f32::MIN;

        for (i, response) in infer_responses.iter().enumerate() {
            // mean logprobs of the generated tokens
            let sequence_logprob = response
                .tokens
                .iter()
                .map(|token| token.logprob)
                .sum::<f32>()
                / response.tokens.len() as f32;

            // set best sequence
            if sequence_logprob > max_logprob {
                max_index = i;
                max_logprob = sequence_logprob;
            }
        }
        let best_response = infer_responses.remove(max_index);
        Ok((best_response, infer_responses))
    }

    #[instrument(skip(self))]
    pub(crate) async fn health(&self) -> bool {
        let health = self
            .backend
            .health(self.backend_health.load(Ordering::SeqCst))
            .await;
        self.backend_health.store(health, Ordering::SeqCst);
        health
    }
}

#[derive(Debug)]
pub struct GeneratedText {
    pub text: String,
    pub generated_tokens: u32,
    pub finish_reason: FinishReason,
    pub seed: Option<u64>,
}

#[derive(Debug)]
pub enum InferStreamResponse {
    // Optional first message
    Prefill(Vec<PrefillToken>),
    // Intermediate messages
    Intermediate {
        token: Token,
        top_tokens: Vec<Token>,
    },
    // Last message
    End {
        token: Token,
        top_tokens: Vec<Token>,
        generated_text: GeneratedText,
        start: Instant,
        queued: Instant,
    },
}

#[derive(Debug)]
pub(crate) struct InferResponse {
    /// input_length is the input as perceived by the rust tokenizer in the
    /// validation pathway. It is redundant with prefill.len() but prefill
    /// has data only if the user asked for it. This will always be filled.
    pub(crate) _input_length: u32,
    pub(crate) prefill: Vec<PrefillToken>,
    pub(crate) tokens: Vec<Token>,
    pub(crate) generated_text: GeneratedText,
    pub(crate) queued: Instant,
    pub(crate) start: Instant,
    pub(crate) top_tokens: Vec<Vec<Token>>,
}

#[derive(Debug, Error)]
pub enum InferError {
    #[error("Request failed during generation: {0}")]
    GenerationError(String),
    #[error("Model is overloaded")]
    Overloaded(#[from] TryAcquireError),
    #[error("Input validation error: {0}")]
    ValidationError(#[from] ValidationError),
    #[error("Incomplete generation")]
    IncompleteGeneration,
    #[error("Incomplete generation stream")]
    IncompleteGenerationStream,
    #[error("Template error: {0}")]
    TemplateError(#[from] minijinja::Error),
    #[error("Missing template vatiable: {0}")]
    MissingTemplateVariable(String),
    #[error("Tool error: {0}")]
    ToolError(String),
    #[error("Stream event serialization error")]
    StreamSerializationError(String),
}

impl InferError {
    pub(crate) fn error_type(&self) -> &str {
        match self {
            InferError::GenerationError(_) => "generation",
            InferError::Overloaded(_) => "overloaded",
            InferError::ValidationError(_) => "validation",
            InferError::IncompleteGeneration => "incomplete_generation",
            InferError::IncompleteGenerationStream => "incomplete_generation_stream",
            InferError::TemplateError(_) => "template_error",
            InferError::MissingTemplateVariable(_) => "missing_template_variable",
            InferError::ToolError(_) => "tool_error",
            InferError::StreamSerializationError(_) => "stream_serialization_error",
        }
    }

    pub(crate) fn into_openai_event(self) -> Event {
        Event::default()
            .json_data(OpenaiErrorEvent {
                error: APIError {
                    message: self.to_string(),
                    http_status_code: 422,
                },
            })
            .unwrap()
    }
}

#[derive(Serialize)]
pub struct APIError {
    message: String,
    http_status_code: usize,
}

#[derive(Serialize)]
pub struct OpenaiErrorEvent {
    error: APIError,
}


================================================
FILE: router/src/infer/tool_grammar.rs
================================================
use crate::infer::InferError;
use crate::{
    FunctionDefinition, FunctionRef, FunctionsMap, JsonSchemaTool, Properties, Tool, ToolChoice,
};
use serde_json::{json, Map, Value};
use std::collections::HashMap;

pub(crate) struct ToolGrammar {}

impl ToolGrammar {
    // find a tool by name
    fn find_tool_by_name(tools: &[Tool], name: &str) -> Result<Tool, InferError> {
        tools
            .iter()
            .find(|tool| tool.function.name == name)
            .cloned()
            .ok_or_else(|| InferError::ToolError(format!("Tool with name {} not found", name)))
    }

    pub fn apply(
        tools: Vec<Tool>,
        tool_choice: ToolChoice,
    ) -> Result<Option<(Vec<Tool>, JsonSchemaTool)>, InferError> {
        let tools_to_use = match tool_choice {
            ToolChoice::Function(function) => {
                vec![Self::find_tool_by_name(&tools, &function.name)?]
            }
            ToolChoice::Required => tools,
            ToolChoice::Auto => {
                // only add the no_tool function if the user has selected the auto option
                tools
                    .iter()
                    .cloned()
                    .chain(std::iter::once(Tool {
                        r#type: "function".to_string(),
                        function: FunctionDefinition {
                            name: "no_tool".to_string(),
                            description: Some(
                                "Open ended response with no specific tool selected".to_string(),
                            ),
                            arguments: json!({
                                "type": "object",
                                // "properties": {
                                //     "content": {
                                //         "type": "string",
                                //         "description": "The response content",
                                //     }
                                // },
                                // "required": ["content"]
                            }),
                        },
                    }))
                    .collect::<Vec<_>>()
            }
            ToolChoice::NoTool => vec![],
        };

        // if no tools are provided or if the user has selected the no_tool option, return None
        if tools_to_use.is_empty() {
            return Ok(None);
        }

        let functions: HashMap<String, serde_json::Value> = tools_to_use
            .iter()
            .map(|tool| {
                let func = tool.function.clone();

                let mut params = Map::new();

                params.insert(
                    "description".to_string(),
                    Value::String(func.description.unwrap_or_default()),
                );

                let mut properties = Map::new();
                let mut required = vec![Value::String("_name".to_string())];

                properties.insert(
                    "_name".to_string(),
                    json!({
                        "type": "string",
                        "const": func.name.clone(),
                    }),
                );

                if let Value::Object(args) = func.arguments {
                    if let Some(Value::Object(props)) = args.get("properties") {
                        properties.extend(props.clone());
                    }
                    if let Some(Value::Array(reqs)) = args.get("required") {
                        required.extend(reqs.clone());
                    }
                    params.insert(
                        "additionalProperties".to_string(),
                        Value::Bool(
                            args.get("additionalProperties").and_then(|v| v.as_str())
                                == Some("true"),
                        ),
                    );
                }

                params.insert("properties".to_string(), Value::Object(properties));
                params.insert("required".to_string(), Value::Array(required));

                (func.name, Value::Object(params))
            })
            .collect();

        let tool_schema = JsonSchemaTool {
            functions_map: FunctionsMap { functions },
            properties: Properties {
                function: tools_to_use
                    .iter()
                    .map(|tool| FunctionRef {
                        ref_path: format!("#/$functions/{}", tool.function.name.clone()),
                    })
                    .collect(),
            },
        };

        Ok(Some((tools_to_use, tool_schema)))
    }
}


================================================
FILE: router/src/kserve.rs
================================================
use crate::infer::Infer;
use crate::{
    default_parameters,
    server::{generate_internal, ComputeType},
    Deserialize, ErrorResponse, GenerateParameters, GenerateRequest, Serialize, ToSchema,
};
use axum::extract::{Extension, Path};
use axum::http::{HeaderMap, StatusCode};
use axum::response::IntoResponse;
use axum::Json;
use futures::stream::FuturesUnordered;
use futures::TryStreamExt;

#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct OutputChunk {
    pub name: String,
    pub shape: Vec<usize>,
    pub datatype: String,
    pub data: Vec<u8>,
}

#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct InferenceOutput {
    pub id: String,
    pub outputs: Vec<OutputChunk>,
}

#[derive(Debug, Deserialize, ToSchema)]
pub(crate) struct InferenceRequest {
    pub id: String,
    #[serde(default = "default_parameters")]
    pub parameters: GenerateParameters,
    pub inputs: Vec<Input>,
    pub outputs: Vec<Output>,
}

#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub(crate) struct Input {
    pub name: String,
    pub shape: Vec<usize>,
    pub datatype: String,
    pub data: Vec<u8>,
}

#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub(crate) struct Output {
    pub name: String,
}

#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct LiveResponse {
    pub live: bool,
}

#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct ReadyResponse {
    pub live: bool,
}

#[derive(Debug, Serialize, Deserialize, ToSchema)]
pub struct MetadataServerResponse {
    pub name: String,
    pub version: String,
    pub extensions: Vec<String>,
}

#[utoipa::path(
    post,
    tag = "Text Generation Inference",
    path = "/v2/health/live",
    responses(
        (status = 200, description = "Service is live", body = LiveReponse),
        (status = 404, description = "Service not found", body = ErrorResponse,
            example = json!({"error": "No response"}))
    )
)]
pub async fn kserve_health_live() -> Json<LiveResponse> {
    let data = LiveResponse { live: true };
    Json(data)
}

#[utoipa::path(
    get,
    tag = "Text Generation Inference",
    path = "/v2/health/ready",
    responses(
        (status = 200, description = "Service is ready", body = ReadyResponse),
        (status = 404, description = "Service not found", body = ErrorResponse,
            example = json!({"error": "No response"}))
    )
)]
pub async fn kserve_health_ready() -> Json<ReadyResponse> {
    let data = ReadyResponse { live: true };
    Json(data)
}

#[utoipa::path(
    get,
    tag = "Text Generation Inference",
    path = "/v2",
    responses(
        (status = 200, description = "Metadata retrieved", body = MetadataServerResponse),
        (status = 404, description = "Service not found", body = ErrorResponse,
            example = json!({"error": "No response"}))
    )
)]
pub async fn kerve_server_metadata() -> Json<MetadataServerResponse> {
    let data = MetadataServerResponse {
        name: "text-generation-inference".to_string(),
        version: env!("CARGO_PKG_VERSION").to_string(),
        extensions: vec![
            "health".to_string(),
            "models".to_string(),
            "metrics".to_string(),
        ],
    };
    Json(data)
}

#[utoipa::path(
    get,
    tag = "Text Generation Inference",
    path = "/v2/models/{model_name}/versions/{model_version}",
    responses(
        (status = 200, description = "Model version metadata retrieved", body = MetadataServerResponse),
        (status = 404, description = "Model or version not found", body = ErrorResponse,
            example = json!({"error": "No response"}))
    )
)]
pub async fn kserve_model_metadata(
    Path((model_name, model_version)): Path<(String, String)>,
) -> Json<MetadataServerResponse> {
    let data = MetadataServerResponse {
        name: model_name,
        version: model_version,
        extensions: vec!["infer".to_string(), "ready".to_string()],
    };
    Json(data)
}

#[utoipa::path(
    get,
    tag = "Text Generation Inference",
    path = "/v2/models/{model_name}/versions/{model_version}/ready",
    responses(
        (status = 200, description = "Model version is ready", body = ReadyResponse),
        (status = 404, description = "Model or version not found", body = ErrorResponse,
            example = json!({"error": "No response"}))
    )
)]
pub async fn kserve_model_metadata_ready(
    Path((_model_name, _model_version)): Path<(String, String)>,
) -> Json<ReadyResponse> {
    let data = ReadyResponse { live: true };
    Json(data)
}

#[utoipa::path(
    post,
    tag = "Text Generation Inference",
    path = "/v2/models/{model_name}/versions/{model_version}/infer",
    request_body = Json<InferenceRequest>,
    responses(
        (status = 200, description = "Inference executed successfully", body = InferenceOutput),
        (status = 404, description = "Model or version not found", body = ErrorResponse,
            example = json!({"error": "No response"}))
    )
)]
pub async fn kserve_model_infer(
    infer: Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Json(payload): Json<InferenceRequest>,
) -> Result<impl IntoResponse, (StatusCode, Json<ErrorResponse>)> {
    let id = payload.id.clone();
    let str_inputs = payload
        .inputs
        .iter()
        .map(|input| {
            std::str::from_utf8(&input.data).map_err(|e| {
                (
                    StatusCode::UNPROCESSABLE_ENTITY,
                    Json(ErrorResponse {
                        error: e.to_string(),
                        error_type: "utf8".to_string(),
                    }),
                )
            })
        })
        .collect::<Result<Vec<_>, _>>()?;

    if str_inputs.len() != payload.outputs.len() {
        return Err((
            StatusCode::UNPROCESSABLE_ENTITY,
            Json(ErrorResponse {
                error: "Inputs and outputs length mismatch".to_string(),
                error_type: "length mismatch".to_string(),
            }),
        ));
    }

    let output_chunks = str_inputs
        .iter()
        .zip(&payload.outputs)
        .map(|(str_input, output)| {
            let generate_request = GenerateRequest {
                inputs: str_input.to_string(),
                parameters: payload.parameters.clone(),
                add_special_tokens: true,
            };
            let infer = infer.clone();
            let compute_type = compute_type.clone();
            let span = tracing::Span::current();
            async move {
                generate_internal(infer, compute_type, Json(generate_request), span)
                    .await
                    .map(|(_, _, Json(generation))| {
                        let generation_as_bytes = generation.generated_text.as_bytes().to_vec();
                        OutputChunk {
                            name: output.name.clone(),
                            shape: vec![1, generation_as_bytes.len()],
                            datatype: "BYTES".to_string(),
                            data: generation_as_bytes,
                        }
                    })
                    .map_err(|_| {
                        (
                            StatusCode::INTERNAL_SERVER_ERROR,
                            Json(ErrorResponse {
                                error: "Incomplete generation".into(),
                                error_type: "Incomplete generation".into(),
                            }),
                        )
                    })
            }
        })
        .collect::<FuturesUnordered<_>>()
        .try_collect::<Vec<_>>()
        .await?;

    let inference_output = InferenceOutput {
        id: id.clone(),
        outputs: output_chunks,
    };

    Ok((HeaderMap::new(), Json(inference_output)))
}


================================================
FILE: router/src/lib.rs
================================================
/// Text Generation Inference Webserver
pub mod config;
pub mod infer;
pub mod server;
pub mod validation;

#[cfg(feature = "kserve")]
mod kserve;
pub mod logging;

mod chat;
mod sagemaker;
pub mod usage_stats;
mod vertex;

use crate::infer::tool_grammar::ToolGrammar;
use crate::infer::{Infer, InferError};
use pyo3::prelude::*;
use pyo3::types::IntoPyDict;
use serde::{Deserialize, Serialize};
use tokenizers::Encoding;
use tracing::warn;
use utoipa::ToSchema;
use uuid::Uuid;
use validation::Validation;

#[allow(clippy::large_enum_variant)]
#[derive(Clone)]
pub enum Tokenizer {
    Python {
        tokenizer_name: String,
        revision: Option<String>,
        trust_remote_code: bool,
    },
    Rust(tokenizers::Tokenizer),
}

pub struct PyTokenizer<'a>(pyo3::Bound<'a, pyo3::PyAny>);

impl<'a> PyTokenizer<'a> {
    fn from_py(
        py: Python<'a>,
        tokenizer_name: String,
        revision: Option<String>,
        trust_remote_code: bool,
    ) -> PyResult<PyTokenizer<'a>> {
        let transformers = py.import_bound("transformers")?;
        let auto = transformers.getattr("AutoTokenizer")?;
        let from_pretrained = auto.getattr("from_pretrained")?;
        let args = (tokenizer_name,);
        let kwargs = if let Some(rev) = &revision {
            [
                ("revision", rev.to_string().into_py(py)),
                ("trust_remote_code", trust_remote_code.into_py(py)),
            ]
            .into_py_dict_bound(py)
        } else {
            [("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py)
        };
        let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
        tracing::info!("Loaded a python tokenizer");
        Ok(PyTokenizer(tokenizer))
    }
}

trait TokenizerTrait {
    fn encode_trait(
        &self,
        query: String,
        add_special_tokens: bool,
    ) -> Result<tokenizers::Encoding, Box<dyn std::error::Error + Send + Sync>>;
}

impl TokenizerTrait for tokenizers::Tokenizer {
    fn encode_trait(
        &self,
        query: String,
        add_special_tokens: bool,
    ) -> Result<tokenizers::Encoding, Box<dyn std::error::Error + Send + Sync>> {
        self.encode(query, add_special_tokens)
    }
}

impl TokenizerTrait for PyTokenizer<'_> {
    fn encode_trait(
        &self,
        query: String,
        add_special_tokens: bool,
    ) -> Result<tokenizers::Encoding, Box<dyn std::error::Error + Send + Sync>> {
        let py = self.0.py();
        let kwargs = [
            ("text", query.into_py(py)),
            ("add_special_tokens", add_special_tokens.into_py(py)),
        ]
        .into_py_dict_bound(py);
        let encode = self.0.getattr("encode")?;
        let input_ids: Vec<u32> = encode.call((), Some(&kwargs))?.extract()?;
        Ok(Encoding::new(
            input_ids,
            vec![],                           // type ids
            vec![],                           // tokens (strings)
            vec![],                           // words
            vec![],                           // offsets
            vec![],                           // special_tokens_mask
            vec![],                           // attention_mask
            vec![],                           // overflowing
            std::collections::HashMap::new(), //sequence_ranges
        ))
    }
}

/// Hub type
#[derive(Clone, Debug, Deserialize)]
pub struct HubModelInfo {
    #[serde(rename(deserialize = "id"))]
    pub model_id: String,
    pub sha: Option<String>,
    pub pipeline_tag: Option<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ChatTemplate {
    name: String,
    template: String,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(untagged)]
pub enum ChatTemplateVersions {
    Single(String),
    Multiple(Vec<ChatTemplate>),
}

use std::path::Path;

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct HubTokenizerConfig {
    pub chat_template: Option<ChatTemplateVersions>,
    pub completion_template: Option<String>,
    pub bos_token: Option<TokenizerConfigToken>,
    pub eos_token: Option<TokenizerConfigToken>,
    pub tokenizer_class: Option<String>,
    pub add_bos_token: Option<bool>,
    pub add_eos_token: Option<bool>,
}

impl HubTokenizerConfig {
    pub fn from_file<P: AsRef<Path>>(filename: P) -> Option<Self> {
        std::fs::read_to_string(filename)
            .ok()
            .and_then(|content| serde_json::from_str(&content).ok())
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ChatTemplateStandalone {
    pub chat_template: ChatTemplateVersions,
}

#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
#[serde(untagged)]
pub enum TokenizerConfigToken {
    String(String),
    Object { content: String },
}

impl TokenizerConfigToken {
    pub fn as_str(&self) -> &str {
        match self {
            TokenizerConfigToken::String(s) => s,
            TokenizerConfigToken::Object { content } => content,
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "processor_class")]
pub enum HubPreprocessorConfig {
    Idefics2Processor(Idefics2Preprocessor),
    Idefics3Processor(Idefics2Preprocessor),
    Gemma3Processor(Gemma3Processor),
    Llama4Processor(Llama4Processor),
}

impl HubPreprocessorConfig {
    pub fn from_file<P: AsRef<std::path::Path>>(filename: P) -> Option<Self> {
        let content = std::fs::read_to_string(filename).ok()?;
        serde_json::from_str(&content).ok()
    }
}

#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Idefics2Preprocessor {
    #[serde(default)]
    do_image_splitting: bool,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Gemma3Processor {
    #[serde(default)]
    do_image_splitting: bool,
}

#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Llama4Processor {
    #[serde(default)]
    max_patches: usize,
}

#[derive(Debug, Clone, Deserialize, Default)]
pub struct HubProcessorConfig {
    pub chat_template: Option<ChatTemplateVersions>,
    pub image_seq_len: usize,
    pub processor_class: Option<String>,
}

impl HubProcessorConfig {
    pub fn from_file<P: AsRef<Path>>(filename: P) -> Option<Self> {
        std::fs::read_to_string(filename)
            .ok()
            .and_then(|content| serde_json::from_str(&content).ok())
    }
}

#[derive(Clone, Debug, Deserialize, ToSchema, Serialize)]
#[cfg_attr(test, derive(PartialEq))]
struct JsonSchemaConfig {
    /// Optional name identifier for the schema
    #[serde(skip_serializing_if = "Option::is_none")]
    name: Option<String>,

    /// The actual JSON schema definition
    schema: serde_json::Value,
}

#[derive(Clone, Debug, Deserialize, ToSchema, Serialize)]
#[cfg_attr(test, derive(PartialEq))]
#[serde(tag = "type", content = "value")]
pub(crate) enum GrammarType {
    /// A string that represents a [JSON Schema](https://json-schema.org/).
    ///
    /// JSON Schema is a declarative language that allows to annotate JSON documents
    /// with types and descriptions.
    #[serde(rename = "json")]
    #[serde(alias = "json_object")]
    #[schema(example = json ! ({"properties": {"location":{"type": "string"}}}))]
    Json(serde_json::Value),

    #[serde(rename = "regex")]
    Regex(String),

    /// A JSON Schema specification with additional metadata.
    ///
    /// Includes an optional name for the schema, an optional strict flag, and the required schema definition.
    #[serde(rename = "json_schema")]
    #[schema(example = json ! ({"schema": {"properties": {"name": {"type": "string"}, "age": {"type": "integer"}}}, "name": "person_info", "strict": true}))]
    JsonSchema(JsonSchemaConfig),
}

#[derive(Clone, Debug, Serialize, ToSchema)]
pub struct Info {
    /// Model info
    #[schema(example = "bigscience/blomm-560m")]
    pub model_id: String,
    #[schema(nullable = true, example = "e985a63cdc139290c5f700ff1929f0b5942cced2")]
    pub model_sha: Option<String>,
    // #[schema(example = "torch.float16")]
    // pub model_dtype: String,
    // #[schema(example = "cuda")]
    // pub model_device_type: String,
    #[schema(nullable = true, example = "text-generation")]
    pub model_pipeline_tag: Option<String>,

    /// Router Parameters
    #[schema(example = "128")]
    pub max_concurrent_requests: usize,
    #[schema(example = "2")]
    pub max_best_of: usize,
    #[schema(example = "4")]
    pub max_stop_sequences: usize,
    #[schema(example = "1024")]
    pub max_input_tokens: usize,
    #[schema(example = "2048")]
    pub max_total_tokens: usize,
    #[schema(example = "2")]
    pub validation_workers: usize,
    #[schema(example = "32")]
    pub max_client_batch_size: usize,

    /// Router Info
    #[schema(example = "text-generation-router")]
    pub router: &'static str,
    #[schema(example = "0.5.0")]
    pub version: &'static str,
    #[schema(nullable = true, example = "null")]
    pub sha: Option<&'static str>,
    #[schema(nullable = true, example = "null")]
    pub docker_label: Option<&'static str>,
}

#[derive(Clone, Debug, Deserialize, ToSchema, Default)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct GenerateParameters {
    /// Generate best_of sequences and return the one if the highest token logprobs.
    #[serde(default)]
    #[schema(exclusive_minimum = 0, nullable = true, default = "null", example = 1)]
    pub best_of: Option<usize>,

    /// The value used to module the logits distribution.
    #[serde(default)]
    #[schema(
        exclusive_minimum = 0.0,
        nullable = true,
        default = "null",
        example = 0.5
    )]
    pub temperature: Option<f32>,

    /// The parameter for repetition penalty. 1.0 means no penalty.
    /// See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    #[serde(default)]
    #[schema(
        exclusive_minimum = 0.0,
        nullable = true,
        default = "null",
        example = 1.03
    )]
    pub repetition_penalty: Option<f32>,

    /// The parameter for frequency penalty. 1.0 means no penalty
    /// Penalize new tokens based on their existing frequency in the text so far,
    /// decreasing the model's likelihood to repeat the same line verbatim.
    #[serde(default)]
    #[schema(
        exclusive_minimum = -2.0,
        nullable = true,
        default = "null",
        example = 0.1
    )]
    pub frequency_penalty: Option<f32>,

    /// The number of highest probability vocabulary tokens to keep for top-k-filtering.
    #[serde(default)]
    #[schema(exclusive_minimum = 0, nullable = true, default = "null", example = 10)]
    pub top_k: Option<i32>,

    /// Top-p value for nucleus sampling.
    #[serde(default)]
    #[schema(
        exclusive_minimum = 0.0,
        maximum = 1.0,
        nullable = true,
        default = "null",
        example = 0.95
    )]
    pub top_p: Option<f32>,

    /// Typical Decoding mass
    /// See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.
    #[serde(default)]
    #[schema(
        exclusive_minimum = 0.0,
        maximum = 1.0,
        nullable = true,
        default = "null",
        example = 0.95
    )]
    pub typical_p: Option<f32>,

    /// Activate logits sampling.
    #[serde(default)]
    #[schema(default = "false", example = true)]
    pub do_sample: bool,

    /// Maximum number of tokens to generate.
    #[serde(default)]
    #[schema(nullable = true, default = "1024", example = "20")]
    pub max_new_tokens: Option<u32>,

    /// Whether to prepend the prompt to the generated text
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = false)]
    pub return_full_text: Option<bool>,

    /// Stop generating tokens if a member of `stop` is generated.
    #[serde(default)]
    #[schema(inline, max_items = 4, example = json ! (["photographer"]))]
    pub stop: Vec<String>,

    /// Truncate inputs tokens to the given size.
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = "null")]
    pub truncate: Option<usize>,

    /// Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).
    #[serde(default)]
    #[schema(default = "false", example = true)]
    pub watermark: bool,

    /// Whether to return generation details.
    #[serde(default)]
    #[schema(default = "true")]
    pub details: bool,

    /// Whether to return decoder input token logprobs and ids.
    #[serde(default)]
    #[schema(default = "false")]
    pub decoder_input_details: bool,

    /// Random sampling seed.
    #[serde(default)]
    #[schema(
        exclusive_minimum = 0,
        nullable = true,
        default = "null",
        example = "null"
    )]
    pub seed: Option<u64>,

    /// The number of highest probability vocabulary tokens to keep for top-n-filtering.
    #[serde(default)]
    #[schema(exclusive_minimum = 0, nullable = true, default = "null", example = 5)]
    pub top_n_tokens: Option<u32>,

    /// Grammar constraints for the generation.
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = "null")]
    pub grammar: Option<GrammarType>,

    /// Lora adapter id
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = "null")]
    pub adapter_id: Option<String>,
}

fn default_parameters() -> GenerateParameters {
    GenerateParameters {
        best_of: None,
        temperature: None,
        repetition_penalty: None,
        frequency_penalty: None,
        top_k: None,
        top_p: None,
        typical_p: None,
        do_sample: true,
        max_new_tokens: None,
        return_full_text: None,
        stop: Vec::new(),
        truncate: None,
        watermark: false,
        details: false,
        decoder_input_details: false,
        seed: None,
        top_n_tokens: None,
        grammar: None,
        adapter_id: None,
    }
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)]
#[serde(try_from = "PromptDeserializer")]
pub struct Prompt(pub Vec<String>);

#[derive(Deserialize)]
#[serde(untagged)]
enum PromptDeserializer {
    Single(String),
    Multiple(Vec<String>),
}

impl TryFrom<PromptDeserializer> for Prompt {
    type Error = String;

    fn try_from(value: PromptDeserializer) -> Result<Self, Self::Error> {
        match value {
            PromptDeserializer::Single(s) => Ok(Prompt(vec![s])),
            PromptDeserializer::Multiple(v) => {
                if v.is_empty() {
                    Err(
                        "Empty array detected. Do not use an empty array for the prompt."
                            .to_string(),
                    )
                } else {
                    Ok(Prompt(v))
                }
            }
        }
    }
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)]
pub struct CompletionRequest {
    /// UNUSED
    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
    /// ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.
    pub model: Option<String>,

    /// The prompt to generate completions for.
    #[schema(example = "What is Deep Learning?")]
    pub prompt: Prompt,

    /// The maximum number of tokens that can be generated in the chat completion.
    #[serde(default)]
    #[schema(default = "1024", example = "32")]
    pub max_tokens: Option<u32>,

    /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while
    /// lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.
    #[serde(default)]
    #[schema(nullable = true, example = 1.0)]
    pub temperature: Option<f32>,

    /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the
    /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
    #[serde(default)]
    #[schema(nullable = true, example = 0.95)]
    pub top_p: Option<f32>,

    #[serde(default = "bool::default")]
    pub stream: bool,

    #[schema(nullable = true, example = 42)]
    pub seed: Option<u64>,

    /// The text to append to the prompt. This is useful for completing sentences or generating a paragraph of text.
    /// please see the completion_template field in the model's tokenizer_config.json file for completion template.
    #[serde(default)]
    pub suffix: Option<String>,

    #[serde(default)]
    pub repetition_penalty: Option<f32>,

    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,
    /// decreasing the model's likelihood to repeat the same line verbatim.
    #[serde(default)]
    #[schema(example = "1.0")]
    pub frequency_penalty: Option<f32>,

    /// Up to 4 sequences where the API will stop generating further tokens.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub stop: Option<Vec<String>>,
}

#[derive(Clone, Serialize, ToSchema)]
#[serde(tag = "object")]
enum Completion {
    #[serde(rename = "text_completion")]
    Chunk(Chunk),
    #[serde(rename = "text_completion")]
    Final(CompletionFinal),
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Default)]
pub(crate) struct CompletionFinal {
    pub id: String,
    #[schema(example = "1706270835")]
    pub created: u64,
    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
    pub model: String,
    pub system_fingerprint: String,
    pub choices: Vec<CompletionComplete>,
    pub usage: Usage,
}

#[derive(Clone, Deserialize, Serialize, ToSchema)]
pub(crate) struct CompletionComplete {
    pub index: u32,
    pub text: String,
    pub logprobs: Option<Vec<f32>>,
    pub finish_reason: String,
}

#[derive(Clone, Deserialize, Serialize, ToSchema)]
pub(crate) struct Chunk {
    pub id: String,
    pub created: u64,
    pub choices: Vec<CompletionComplete>,
    pub model: String,
    pub system_fingerprint: String,
}

#[derive(Clone, Deserialize, Serialize, ToSchema)]
#[cfg_attr(test, derive(Debug))]
pub(crate) struct ChatCompletion {
    pub id: String,
    #[schema(example = "1706270835")]
    pub created: u64,
    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
    pub model: String,
    pub system_fingerprint: String,
    pub choices: Vec<ChatCompletionComplete>,
    pub usage: Usage,
}

#[derive(Clone, Deserialize, Serialize, ToSchema)]
#[cfg_attr(test, derive(Debug))]
pub(crate) struct ChatCompletionComplete {
    pub index: u32,
    pub message: OutputMessage,
    pub logprobs: Option<ChatCompletionLogprobs>,
    pub finish_reason: String,
}

#[derive(Clone, Deserialize, Serialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct ChatCompletionLogprobs {
    content: Vec<ChatCompletionLogprob>,
}

impl From<(Token, Vec<Token>)> for ChatCompletionLogprobs {
    fn from(value: (Token, Vec<Token>)) -> Self {
        let (token, top_tokens) = value;

        Self {
            content: vec![ChatCompletionLogprob {
                token: token.text,
                logprob: token.logprob,
                top_logprobs: top_tokens
                    .into_iter()
                    .map(|t| ChatCompletionTopLogprob {
                        token: t.text,
                        logprob: t.logprob,
                    })
                    .collect(),
            }],
        }
    }
}

impl From<(Vec<Token>, Vec<Vec<Token>>)> for ChatCompletionLogprobs {
    fn from(value: (Vec<Token>, Vec<Vec<Token>>)) -> Self {
        let (tokens, top_tokens) = value;

        // Create an iterator that produces None for top_tokens once it's exhausted
        let top_tokens_iter = top_tokens
            .into_iter()
            .map(Some)
            .chain(std::iter::repeat(None));

        let content = tokens
            .into_iter()
            .zip(top_tokens_iter)
            .map(|(t, top_t_option)| ChatCompletionLogprob {
                token: t.text,
                logprob: t.logprob,
                top_logprobs: match top_t_option {
                    Some(top_t) => top_t
                        .into_iter()
                        .map(|t| ChatCompletionTopLogprob {
                            token: t.text,
                            logprob: t.logprob,
                        })
                        .collect(),
                    None => vec![], // Handle the case where there are no top tokens
                },
            })
            .collect();

        Self { content }
    }
}

#[derive(Clone, Deserialize, Serialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct ChatCompletionLogprob {
    token: String,
    logprob: f32,
    top_logprobs: Vec<ChatCompletionTopLogprob>,
}

#[derive(Clone, Deserialize, Serialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct ChatCompletionTopLogprob {
    token: String,
    logprob: f32,
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Default)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct Usage {
    pub prompt_tokens: u32,
    pub completion_tokens: u32,
    pub total_tokens: u32,
}

#[derive(Clone, Serialize, ToSchema)]
#[serde(tag = "object")]
#[cfg_attr(test, derive(Debug))]
enum CompletionType {
    #[serde(rename = "chat.completion.chunk")]
    ChatCompletionChunk(ChatCompletionChunk),
    #[serde(rename = "chat.completion")]
    ChatCompletion(ChatCompletion),
}

impl ChatCompletion {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        model: String,
        system_fingerprint: String,
        output: Option<String>,
        created: u64,
        details: Details,
        return_logprobs: bool,
        tool_calls: Option<Vec<ToolCall>>,
        prompt_tokens: u32,
    ) -> Self {
        let message = match (output, tool_calls) {
            (Some(content), None) => OutputMessage::ChatMessage(TextMessage {
                role: "assistant".into(),
                content,
                ..Default::default()
            }),
            (None, Some(tool_calls)) => OutputMessage::ToolCall(ToolCallMessage {
                role: "assistant".to_string(),
                tool_calls,
            }),
            (Some(output), Some(_)) => {
                warn!("Received both chat and tool call");
                OutputMessage::ChatMessage(TextMessage {
                    role: "assistant".into(),
                    content: output,
                    ..Default::default()
                })
            }
            (None, None) => {
                warn!("Didn't receive an answer");
                OutputMessage::ChatMessage(TextMessage {
                    role: "assistant".into(),
                    content: "".to_string(),
                    ..Default::default()
                })
            }
        };
        Self {
            id: String::new(),
            created,
            model,
            system_fingerprint,
            choices: vec![ChatCompletionComplete {
                index: 0,
                message,
                logprobs: return_logprobs
                    .then(|| ChatCompletionLogprobs::from((details.tokens, details.top_tokens))),
                finish_reason: details.finish_reason.format(true),
            }],
            usage: Usage {
                prompt_tokens,
                completion_tokens: details.generated_tokens,
                total_tokens: prompt_tokens + details.generated_tokens,
            },
        }
    }
}
#[derive(Clone, Serialize, ToSchema)]
#[cfg_attr(test, derive(Debug))]
pub(crate) struct ChatCompletionChunk {
    pub id: String,
    #[schema(example = "1706270978")]
    pub created: u64,
    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
    pub model: String,
    pub system_fingerprint: String,
    pub choices: Vec<ChatCompletionChoice>,
    pub usage: Option<Usage>,
}

#[derive(Clone, Serialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct ChatCompletionChoice {
    pub index: u32,
    pub delta: ChatCompletionDelta,
    pub logprobs: Option<ChatCompletionLogprobs>,
    pub finish_reason: Option<String>,
}

#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
pub struct ToolCallDelta {
    #[schema(example = "assistant")]
    role: String,
    tool_calls: Vec<DeltaToolCall>,
}

#[derive(Clone, Debug, Serialize, ToSchema)]
#[serde(untagged)]
#[cfg_attr(test, derive(PartialEq))]
enum ChatCompletionDelta {
    Chat(TextMessage),
    Tool(ToolCallDelta),
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
pub(crate) struct DeltaToolCall {
    pub index: u32,
    pub id: String,
    pub r#type: String,
    pub function: Function,
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
pub(crate) struct Function {
    pub name: Option<String>,
    pub arguments: String,
}

#[allow(clippy::too_many_arguments)]
impl ChatCompletionChunk {
    pub(crate) fn new(
        model: String,
        system_fingerprint: String,
        created: u64,
        choices: Vec<ChatCompletionChoice>,
        usage: Option<Usage>,
    ) -> Self {
        Self {
            id: String::new(),
            created,
            model,
            system_fingerprint,
            choices,
            usage,
        }
    }
}

#[derive(Clone, Deserialize, ToSchema, Serialize)]
#[cfg_attr(test, derive(Debug, PartialEq, Default))]
pub(crate) struct ChatRequest {
    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
    /// [UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.
    pub model: Option<String>,

    /// A list of messages comprising the conversation so far.
    #[schema(example = "[{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}]")]
    pub messages: Vec<Message>,

    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,
    /// decreasing the model's likelihood to repeat the same line verbatim.
    #[serde(default)]
    #[schema(example = "1.0")]
    pub frequency_penalty: Option<f32>,

    /// UNUSED
    /// Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens
    /// (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,
    /// the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,
    /// but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should
    /// result in a ban or exclusive selection of the relevant token.
    #[serde(default)]
    pub logit_bias: Option<Vec<f32>>,

    /// Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each
    /// output token returned in the content of message.
    #[serde(default)]
    #[schema(example = "false")]
    pub logprobs: Option<bool>,

    /// An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with
    /// an associated log probability. logprobs must be set to true if this parameter is used.
    #[serde(default)]
    #[schema(example = "5")]
    pub top_logprobs: Option<u32>,

    /// The maximum number of tokens that can be generated in the chat completion.
    #[serde(default, alias = "max_completion_tokens")]
    #[schema(default = "1024", example = "32")]
    pub max_tokens: Option<u32>,

    /// UNUSED
    /// How many chat completion choices to generate for each input message. Note that you will be charged based on the
    /// number of generated tokens across all of the choices. Keep n as 1 to minimize costs.
    #[serde(default)]
    #[schema(nullable = true, example = "2")]
    pub n: Option<u32>,

    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,
    /// increasing the model's likelihood to talk about new topics
    #[serde(default)]
    #[schema(nullable = true, example = 0.1)]
    pub presence_penalty: Option<f32>,

    /// Up to 4 sequences where the API will stop generating further tokens.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub stop: Option<Vec<String>>,

    #[serde(default = "bool::default")]
    pub stream: bool,

    #[schema(nullable = true, example = 42)]
    pub seed: Option<u64>,

    /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while
    /// lower values like 0.2 will make it more focused and deterministic.
    ///
    /// We generally recommend altering this or `top_p` but not both.
    #[serde(default)]
    #[schema(nullable = true, example = 1.0)]
    pub temperature: Option<f32>,

    /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the
    /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
    #[serde(default)]
    #[schema(nullable = true, example = 0.95)]
    pub top_p: Option<f32>,

    /// A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of
    /// functions the model may generate JSON inputs for.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub tools: Option<Vec<Tool>>,

    /// A prompt to be appended before the tools
    #[serde(default)]
    #[schema(
        nullable = true,
        example = "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables."
    )]
    pub tool_prompt: Option<String>,

    /// A specific tool to use. If not provided, the model will default to use any of the tools provided in the tools parameter.
    #[serde(default)]
    #[schema(nullable = true, default = "auto", example = "auto")]
    pub tool_choice: ToolChoice,

    /// Response format constraints for the generation.
    ///
    /// NOTE: A request can use `response_format` OR `tools` but not both.
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = "null")]
    pub response_format: Option<GrammarType>,

    /// Options for streaming response. Only set this when you set stream: true.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub stream_options: StreamOptions,
}

impl ChatRequest {
    fn try_into_generate(self, infer: &Infer) -> Result<(GenerateRequest, bool), InferError> {
        let ChatRequest {
            model,
            max_tokens,
            messages,
            seed,
            stop,
            tools,
            tool_choice,
            tool_prompt,
            temperature,
            response_format,
            presence_penalty,
            frequency_penalty,
            top_p,
            top_logprobs,
            ..
        } = self;

        let repetition_penalty = presence_penalty.map(|x| x + 2.0);
        let max_new_tokens = max_tokens;
        let tool_prompt = tool_prompt
            .filter(|s| !s.is_empty())
            .unwrap_or_else(default_tool_prompt);
        let stop = stop.unwrap_or_default();
        // enable greedy only when temperature is 0
        let (do_sample, temperature) = match temperature {
            Some(0.0) => (false, None),
            other => (true, other),
        };

        if response_format.is_some() && tools.is_some() {
            return Err(InferError::ToolError(
                "Grammar and tools are mutually exclusive".into(),
            ));
        }

        let (inputs, grammar, using_tools) = match response_format {
            Some(format) => {
                let inputs = infer.apply_chat_template(messages, None)?;
                (inputs, Some(format), false)
            }
            None => {
                if let Some(tools) = tools {
                    match ToolGrammar::apply(tools, tool_choice)? {
                        Some((updated_tools, tool_schema)) => {
                            let grammar = GrammarType::Json(serde_json::json!(tool_schema));
                            let inputs: String = infer.apply_chat_template(
                                messages,
                                Some((updated_tools, tool_prompt)),
                            )?;
                            (inputs, Some(grammar), true)
                        }
                        None => {
                            // same as if no response_format or tools are set
                            let inputs = infer.apply_chat_template(messages, None)?;
                            (inputs, None, false)
                        }
                    }
                } else {
                    // if no response_format or tools are set simply apply the chat template to generate inputs
                    let inputs = infer.apply_chat_template(messages, None)?;
                    (inputs, None, false)
                }
            }
        };

        Ok((
            GenerateRequest {
                inputs: inputs.to_string(),
                add_special_tokens: false,
                parameters: GenerateParameters {
                    best_of: None,
                    temperature,
                    repetition_penalty,
                    frequency_penalty,
                    top_k: None,
                    top_p,
                    typical_p: None,
                    do_sample,
                    max_new_tokens,
                    return_full_text: None,
                    stop,
                    truncate: None,
                    watermark: false,
                    details: true,
                    decoder_input_details: false,
                    seed,
                    top_n_tokens: top_logprobs,
                    grammar,
                    adapter_id: model.filter(|m| *m != "tgi"),
                },
            },
            using_tools,
        ))
    }

    fn next_int_id(&self) -> Result<String, Box<dyn std::error::Error>> {
        let mut id: usize = 0;
        for message in &self.messages {
            if let MessageBody::Tool { tool_calls } = &message.body {
                for tool_call in tool_calls {
                    let new_id: usize = tool_call.id.parse()?;
                    id = std::cmp::max(id, new_id + 1);
                }
            }
        }
        Ok(id.to_string())
    }

    /// Try to have linearly increasing id
    /// or resort to using Uuid if the initial
    /// scheme is not understood
    fn next_tool_call_id(&self) -> String {
        self.next_int_id().unwrap_or_else(|_| {
            let uid = Uuid::new_v4().to_string();
            uid.to_string()
        })
    }
}

#[derive(Clone, Deserialize, ToSchema, Serialize, Default)]
#[cfg_attr(test, derive(Debug, PartialEq))]
struct StreamOptions {
    /// If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
    #[schema(example = "true")]
    #[serde(default)]
    include_usage: bool,
}

pub fn default_tool_prompt() -> String {
    "\nGiven the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables.\n".to_string()
}

#[derive(Clone, Debug, Deserialize, ToSchema, PartialEq, Serialize)]
#[serde(tag = "type")]
pub enum TypedChoice {
    #[serde(rename = "function")]
    Function { function: FunctionName },
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, ToSchema)]
pub struct FunctionName {
    pub name: String,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, ToSchema, Default)]
#[serde(from = "ToolTypeDeserializer")]
#[serde(rename_all = "snake_case")]
/// <https://platform.openai.com/docs/guides/function-calling/configuring-function-calling-behavior-using-the-tool_choice-parameter>
pub enum ToolChoice {
    /// Means the model can pick between generating a message or calling one or more tools.
    #[default]
    Auto,
    /// Means the model will not call any tool and instead generates a message.
    #[serde(rename = "none")]
    NoTool,
    /// Means the model must call one or more tools.
    Required,
    /// Forces the model to call a specific tool. This structure aligns with the `OpenAI` API schema to force a specific tool.
    Function(FunctionName),
}

#[derive(Deserialize, ToSchema)]
#[serde(untagged)]
/// Controls which (if any) tool is called by the model.
/// - `none` means the model will not call any tool and instead generates a message.
/// - `auto` means the model can pick between generating a message or calling one or more tools.
/// - `required` means the model must call one or more tools.
/// - Specifying a particular tool via `{\"type\": \"function\", \"function\": {\"name\": \"my_function\"}}` forces the model to call that tool.
///
/// `none` is the default when no tools are present. `auto` is the default if tools are present."
enum ToolTypeDeserializer {
    /// None means `null` was passed in the JSON, and the default choice is applied based on the presence of tools.
    Null,

    /// `auto` means the model can pick between generating a message or calling one or more tools.
    #[schema(example = "auto")]
    String(String),

    /// Specifying a particular tool forces the model to call that tool, with structured function details.
    #[schema(example = r#"{"type": "function", "function": {"name": "my_function"}}"#)]
    TypedChoice(TypedChoice),
}

impl From<ToolTypeDeserializer> for ToolChoice {
    fn from(value: ToolTypeDeserializer) -> Self {
        match value {
            ToolTypeDeserializer::Null => ToolChoice::Auto,
            ToolTypeDeserializer::String(s) => match s.as_str() {
                "none" => ToolChoice::NoTool,
                "auto" => ToolChoice::Auto,
                "required" => ToolChoice::Required,
                _ => ToolChoice::Function(FunctionName { name: s }),
            },
            ToolTypeDeserializer::TypedChoice(TypedChoice::Function { function }) => {
                ToolChoice::Function(function)
            }
        }
    }
}

#[derive(Debug, Deserialize, Serialize, ToSchema, PartialEq)]
pub struct JsonSchemaTool {
    #[serde(flatten)]
    functions_map: FunctionsMap,
    properties: Properties,
}

#[derive(Debug, Serialize, Deserialize, ToSchema, PartialEq)]
struct FunctionsMap {
    #[serde(rename = "$functions")]
    functions: std::collections::HashMap<String, serde_json::Value>,
}

#[derive(Debug, Serialize, Deserialize, ToSchema, PartialEq)]
struct FunctionRef {
    #[serde(rename = "$ref")]
    ref_path: String,
}

#[derive(Debug, Serialize, Deserialize, ToSchema, PartialEq)]
struct Properties {
    #[serde(serialize_with = "serialize_function")]
    function: Vec<FunctionRef>,
}

fn serialize_function<S>(functions: &Vec<FunctionRef>, serializer: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    use serde::ser::SerializeStruct;
    let mut state = serializer.serialize_struct("Function", 1)?;
    state.serialize_field("anyOf", functions)?;
    state.end()
}

#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default, PartialEq)]
pub struct FunctionDefinition {
    #[serde(default)]
    pub description: Option<String>,
    pub name: String,
    #[serde(alias = "parameters", serialize_with = "serialize_as_string")]
    pub arguments: serde_json::Value,
}

fn serialize_as_string<S>(value: &serde_json::Value, serializer: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    serializer.serialize_str(&value.to_string())
}

#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct Tool {
    // The type of the tool. Currently, only 'function' is supported.
    #[schema(example = "function")]
    pub r#type: String,
    // Grab the tool as generic JSON for debugging purposes.
    pub function: FunctionDefinition,
}

#[derive(Clone, Serialize, Deserialize, Default)]
pub(crate) struct ChatTemplateInputs<'a> {
    messages: Vec<TextMessage>,
    bos_token: Option<&'a str>,
    eos_token: Option<&'a str>,
    add_generation_prompt: bool,
    tools: Option<Vec<Tool>>,
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)]
pub struct ToolCall {
    pub id: String,
    pub r#type: String,
    pub function: FunctionDefinition,
}

#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
pub struct Url {
    url: String,
}

#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
#[serde(tag = "type")]
#[serde(rename_all = "snake_case")]
pub enum MessageChunk {
    Text { text: String },
    ImageUrl { image_url: Url },
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
pub struct Message {
    #[schema(example = "user")]
    pub role: String,
    #[serde(flatten)]
    #[schema(example = "My name is David and I")]
    pub body: MessageBody,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[schema(example = "\"David\"")]
    pub name: Option<String>,
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
#[serde(untagged)]
pub enum MessageBody {
    // When a regular text message is provided.
    Content {
        #[serde(rename = "content")]
        content: MessageContent,
    },
    // When tool calls are provided.
    Tool {
        #[serde(rename = "tool_calls")]
        tool_calls: Vec<ToolCall>,
    },
}

#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
#[serde(untagged)]
pub enum MessageContent {
    SingleText(String),
    MultipleChunks(Vec<MessageChunk>),
}

// Pushing a chunk to a single text message will convert it to a multiple chunks message
impl MessageContent {
    pub fn push(&mut self, chunk: MessageChunk) {
        match self {
            MessageContent::SingleText(text) => {
                *self = MessageContent::MultipleChunks(vec![
                    MessageChunk::Text { text: text.clone() },
                    chunk,
                ]);
            }
            MessageContent::MultipleChunks(chunks) => {
                chunks.push(chunk);
            }
        }
    }
}

#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq, Default)]
pub struct TextMessage {
    #[schema(example = "user")]
    pub role: String,
    #[schema(example = "My name is David and I")]
    pub content: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
}

impl From<Message> for TextMessage {
    fn from(value: Message) -> Self {
        let content = match value.body {
            MessageBody::Content { content } => content,
            MessageBody::Tool { tool_calls } => {
                let content = serde_json::to_string(&tool_calls).unwrap_or_default();
                MessageContent::SingleText(content)
            }
        };
        TextMessage {
            role: value.role,
            content: match content {
                MessageContent::SingleText(text) => text,
                MessageContent::MultipleChunks(chunks) => chunks
                    .into_iter()
                    .map(|chunk| match chunk {
                        MessageChunk::Text { text } => text,
                        MessageChunk::ImageUrl { image_url } => format!("![]({})", image_url.url),
                    })
                    .collect::<Vec<_>>()
                    .join(""),
            },
            ..Default::default()
        }
    }
}

#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
pub struct ToolCallMessage {
    #[schema(example = "assistant")]
    role: String,
    tool_calls: Vec<ToolCall>,
}

#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
#[serde(untagged)]
pub(crate) enum OutputMessage {
    ChatMessage(TextMessage),
    ToolCall(ToolCallMessage),
}

#[derive(Clone, Debug, Deserialize, ToSchema)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct GenerateRequest {
    #[schema(example = "My name is Olivier and I")]
    pub inputs: String,
    #[serde(default = "default_parameters")]
    pub parameters: GenerateParameters,

    /// This is used internally because some requests
    /// already contain the templated input therefore
    /// we shouldn't add the special tokens.
    #[serde(default = "default_true", skip)]
    pub add_special_tokens: bool,
}

fn default_true() -> bool {
    true
}

#[derive(Clone, Debug, Deserialize, ToSchema)]
pub(crate) struct CompatGenerateRequest {
    #[schema(example = "My name is Olivier and I")]
    pub inputs: String,
    #[serde(default = "default_parameters")]
    pub parameters: GenerateParameters,
    #[serde(default)]
    #[schema(default = "false")]
    pub stream: bool,
}

impl From<CompatGenerateRequest> for GenerateRequest {
    fn from(req: CompatGenerateRequest) -> Self {
        Self {
            inputs: req.inputs,
            add_special_tokens: true,
            parameters: req.parameters,
        }
    }
}

#[derive(Debug, Serialize, ToSchema)]
pub struct PrefillToken {
    #[schema(example = 0)]
    pub id: u32,
    #[schema(example = "test")]
    pub text: String,
    #[schema(nullable = true, example = - 0.34)]
    pub logprob: f32,
}

#[derive(Debug, Serialize, ToSchema, Clone)]
pub struct Token {
    #[schema(example = 0)]
    pub id: u32,
    #[schema(example = "test")]
    pub text: String,
    #[schema(nullable = true, example = - 0.34)]
    pub logprob: f32,
    #[schema(example = "false")]
    pub special: bool,
}

#[derive(Debug, Serialize, ToSchema)]
pub struct SimpleToken {
    #[schema(example = 0)]
    id: u32,
    #[schema(example = "test")]
    text: String,
    #[schema(example = 0)]
    start: usize,
    #[schema(example = 2)]
    stop: usize,
}

#[derive(Debug, Serialize, ToSchema, Clone)]
#[serde(rename_all(serialize = "snake_case"))]
#[schema(example = "Length")]
pub enum FinishReason {
    #[schema(rename = "length")]
    Length,
    #[serde(rename = "eos_token")]
    #[schema(rename = "eos_token")]
    EndOfSequenceToken,
    #[schema(rename = "stop_sequence")]
    StopSequence,
}

impl std::fmt::Display for FinishReason {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FinishReason::Length => write!(f, "length"),
            FinishReason::EndOfSequenceToken => write!(f, "eos_token"),
            FinishReason::StopSequence => write!(f, "stop_sequence"),
        }
    }
}

impl FinishReason {
    pub fn format(&self, use_stop: bool) -> String {
        match self {
            FinishReason::EndOfSequenceToken if use_stop => "stop".to_string(),
            _ => self.to_string(),
        }
    }
}

#[derive(Serialize, ToSchema)]
pub(crate) struct BestOfSequence {
    #[schema(example = "test")]
    pub generated_text: String,
    #[schema(example = "length")]
    pub finish_reason: FinishReason,
    #[schema(example = 1)]
    pub generated_tokens: u32,
    #[schema(nullable = true, example = 42)]
    pub seed: Option<u64>,
    pub prefill: Vec<PrefillToken>,
    pub tokens: Vec<Token>,
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub top_tokens: Vec<Vec<Token>>,
}

#[derive(Serialize, ToSchema)]
pub(crate) struct Details {
    #[schema(example = "length")]
    pub finish_reason: FinishReason,
    #[schema(example = 1)]
    pub generated_tokens: u32,
    #[schema(nullable = true, example = 42)]
    pub seed: Option<u64>,
    pub prefill: Vec<PrefillToken>,
    pub tokens: Vec<Token>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub best_of_sequences: Option<Vec<BestOfSequence>>,
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub top_tokens: Vec<Vec<Token>>,
}

#[derive(Serialize, ToSchema)]
pub(crate) struct GenerateResponse {
    #[schema(example = "test")]
    pub generated_text: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub details: Option<Details>,
}

#[derive(Serialize, ToSchema)]
pub(crate) struct ChatTokenizeResponse {
    pub(crate) tokenize_response: TokenizeResponse,
    pub(crate) templated_text: String,
}

#[derive(Serialize, ToSchema)]
#[serde(transparent)]
pub(crate) struct TokenizeResponse(Vec<SimpleToken>);

#[derive(Serialize, ToSchema, Clone)]
pub(crate) struct StreamDetails {
    #[schema(example = "length")]
    pub finish_reason: FinishReason,
    #[schema(example = 1)]
    pub generated_tokens: u32,
    #[schema(nullable = true, example = 42)]
    pub seed: Option<u64>,
    #[schema(example = 1)]
    pub input_length: u32,
}

#[derive(Serialize, ToSchema, Clone)]
pub(crate) struct StreamResponse {
    pub index: u32,
    pub token: Token,
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub top_tokens: Vec<Token>,
    #[schema(nullable = true, default = "null", example = "test")]
    pub generated_text: Option<String>,
    #[schema(nullable = true, default = "null")]
    pub details: Option<StreamDetails>,
}

#[derive(Serialize, ToSchema)]
pub(crate) struct ErrorResponse {
    pub error: String,
    pub error_type: String,
}

#[derive(Serialize, Deserialize, ToSchema)]
pub(crate) struct ModelInfo {
    #[schema(example = "gpt2")]
    pub id: String,
    #[schema(example = "model")]
    pub object: String,
    #[schema(example = 1686935002)]
    pub created: u64,
    #[schema(example = "openai")]
    pub owned_by: String,
}

#[derive(Serialize, Deserialize, ToSchema)]
pub(crate) struct ModelsInfo {
    #[schema(example = "list")]
    pub object: String,
    pub data: Vec<ModelInfo>,
}

impl Default for ModelsInfo {
    fn default() -> Self {
        ModelsInfo {
            object: "list".to_string(),
            data: Vec::new(),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    pub(crate) fn get_tokenizer() -> Tokenizer {
        let api = hf_hub::api::sync::Api::new().unwrap();
        let repo = api.model("gpt2".to_string());
        let filename = repo.get("tokenizer.json").unwrap();
        Tokenizer::Rust(tokenizers::Tokenizer::from_file(filename).unwrap())
    }

    #[test]
    fn test_hub_nested_tokens_tokenizer_config() {
        // this is a subset of the tokenizer.json file
        // in this case we expect the tokens to be encoded as simple strings
        let json_content = r#"{
            "chat_template": "test",
            "bos_token": "<｜begin▁of▁sentence｜>",
            "eos_token": "<｜end▁of▁sentence｜>"
        }"#;

        let config: HubTokenizerConfig = serde_json::from_str(json_content).unwrap();

        // check that we successfully parsed the tokens
        assert_eq!(
            config.chat_template,
            Some(ChatTemplateVersions::Single("test".to_string()))
        );
        assert_eq!(
            config.bos_token,
            Some(TokenizerConfigToken::String(
                "<｜begin▁of▁sentence｜>".to_string()
            ))
        );
        assert_eq!(
            config.eos_token,
            Some(TokenizerConfigToken::String(
                "<｜end▁of▁sentence｜>".to_string()
            ))
        );

        // in this case we expect the tokens to be encoded as structured tokens
        // we want the content of the structured token
        let json_content = r#"{
            "chat_template": "test",
            "bos_token": {
              "__type": "AddedToken",
              "content": "<｜begin▁of▁sentence｜>",
              "lstrip": false,
              "normalized": true,
              "rstrip": false,
              "single_word": false
            },
            "eos_token": {
              "__type": "AddedToken",
              "content": "<｜end▁of▁sentence｜>",
              "lstrip": false,
              "normalized": true,
              "rstrip": false,
              "single_word": false
            }
        }"#;

        let config: HubTokenizerConfig = serde_json::from_str(json_content).unwrap();

        // check that we successfully parsed the tokens
        assert_eq!(
            config.chat_template,
            Some(ChatTemplateVersions::Single("test".to_string()))
        );
        assert_eq!(
            config.bos_token,
            Some(TokenizerConfigToken::Object {
                content: "<｜begin▁of▁sentence｜>".to_string()
            })
        );
        assert_eq!(
            config.eos_token,
            Some(TokenizerConfigToken::Object {
                content: "<｜end▁of▁sentence｜>".to_string()
            })
        );
    }

    #[test]
    fn test_chat_simple_string() {
        let json = json!({
            "model": "",
            "messages": [{
                "role": "user",
                "content": "What is Deep Learning?"
            }]
        });
        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();

        assert_eq!(
            request.messages[0],
            Message {
                name: None,
                role: "user".to_string(),
                body: MessageBody::Content {
                    content: MessageContent::SingleText("What is Deep Learning?".to_string())
                },
            }
        );
    }

    #[test]
    fn test_message_content_append() {
        let mut content = MessageContent::SingleText("Initial text".to_string());
        let chunk = MessageChunk::Text {
            text: "Additional text".to_string(),
        };

        content.push(chunk);

        match content {
            MessageContent::MultipleChunks(chunks) => {
                assert_eq!(chunks.len(), 2);
                assert_eq!(
                    chunks[0],
                    MessageChunk::Text {
                        text: "Initial text".to_string()
                    }
                );
                assert_eq!(
                    chunks[1],
                    MessageChunk::Text {
                        text: "Additional text".to_string()
                    }
                );
            }
            _ => panic!("Expected MultipleChunks, but got a different variant"),
        }
    }

    #[test]
    fn test_chat_request() {
        let json = json!({
            "model": "",
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Whats in this image?"},
                    {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"}},
                ]
            }]
        });
        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();

        assert_eq!(
            request.messages[0],
            Message {
                name: None,
                role: "user".to_string(),

                body: MessageBody::Content {
                    content: MessageContent::MultipleChunks(vec![
                        MessageChunk::Text { text: "Whats in this image?".to_string() },
                        MessageChunk::ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() }},
                    ]),
                },
            }
        );
    }

    #[test]
    fn text_message_convert() {
        let message = Message{
            name: None,
                role: "user".to_string(),
                body: MessageBody::Content {
                    content: MessageContent::MultipleChunks(vec![
                        MessageChunk::Text { text: "Whats in this image?".to_string() },
                        MessageChunk::ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } }
                    ]),
                }
            };
        let textmsg: TextMessage = message.into();
        assert_eq!(textmsg.content, "Whats in this image?![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)");
    }

    #[test]
    fn test_chat_stream_options() {
        let json = json!({
            "model": "",
            "stream_options": {"include_usage": true},
            "messages": [{
                "role": "user",
                "content": "Hello"
            }]
        });
        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();

        assert!(matches!(
            request.stream_options,
            StreamOptions {
                include_usage: true
            }
        ));

        let json = json!({
            "model": "",
            "messages": [{
                "role": "user",
                "content": "Hello"
            }]
        });
        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();

        assert!(matches!(
            request.stream_options,
            StreamOptions {
                include_usage: false
            }
        ));
    }

    #[test]
    fn openai_output() {
        let message = OutputMessage::ChatMessage(TextMessage {
            role: "assistant".to_string(),
            content: "This is the answer".to_string(),
            ..Default::default()
        });
        let serialized = serde_json::to_string(&message).unwrap();
        assert_eq!(
            serialized,
            r#"{"role":"assistant","content":"This is the answer"}"#
        );

        let message = OutputMessage::ToolCall(ToolCallMessage {
            role: "assistant".to_string(),
            tool_calls: vec![ToolCall {
                id: "0".to_string(),
                r#type: "function".to_string(),
                function: FunctionDefinition {
                    description: None,
                    name: "myfn".to_string(),
                    arguments: json!({
                        "format": "csv"
                    }),
                },
            }],
        });
        let serialized = serde_json::to_string(&message).unwrap();
        assert_eq!(
            serialized,
            r#"{"role":"assistant","tool_calls":[{"id":"0","type":"function","function":{"description":null,"name":"myfn","arguments":"{\"format\":\"csv\"}"}}]}"#
        );
    }

    #[test]
    fn tool_choice_formats() {
        #[derive(Deserialize)]
        struct TestRequest {
            tool_choice: ToolChoice,
        }

        let de_none: TestRequest = serde_json::from_str(r#"{"tool_choice":"none"}"#).unwrap();
        assert_eq!(de_none.tool_choice, ToolChoice::NoTool);

        let de_auto: TestRequest = serde_json::from_str(r#"{"tool_choice":"auto"}"#).unwrap();
        assert_eq!(de_auto.tool_choice, ToolChoice::Auto);

        let de_required: TestRequest =
            serde_json::from_str(r#"{"tool_choice":"required"}"#).unwrap();
        assert_eq!(de_required.tool_choice, ToolChoice::Required);

        let de_named: TestRequest = serde_json::from_str(r#"{"tool_choice":"myfn"}"#).unwrap();
        assert_eq!(
            de_named.tool_choice,
            ToolChoice::Function(FunctionName {
                name: "myfn".to_string(),
            })
        );

        let de_openai_named: TestRequest = serde_json::from_str(
            r#"{"tool_choice":{"type":"function","function":{"name":"myfn"}}}"#,
        )
        .unwrap();
        assert_eq!(
            de_openai_named.tool_choice,
            ToolChoice::Function(FunctionName {
                name: "myfn".to_string(),
            })
        );
    }
}


================================================
FILE: router/src/logging.rs
================================================
use axum::{extract::Request, middleware::Next, response::Response};
use opentelemetry::sdk::propagation::TraceContextPropagator;
use opentelemetry::sdk::trace;
use opentelemetry::sdk::trace::Sampler;
use opentelemetry::sdk::Resource;
use opentelemetry::trace::{SpanContext, SpanId, TraceContextExt, TraceFlags, TraceId};
use opentelemetry::Context;
use opentelemetry::{global, KeyValue};
use opentelemetry_otlp::WithExportConfig;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};

struct TraceParent {
    #[allow(dead_code)]
    version: u8,
    trace_id: TraceId,
    parent_id: SpanId,
    trace_flags: TraceFlags,
}

fn parse_traceparent(header_value: &str) -> Option<TraceParent> {
    let parts: Vec<&str> = header_value.split('-').collect();
    if parts.len() != 4 {
        return None;
    }

    let version = u8::from_str_radix(parts[0], 16).ok()?;
    if version == 0xff {
        return None;
    }

    let trace_id = TraceId::from_hex(parts[1]).ok()?;
    let parent_id = SpanId::from_hex(parts[2]).ok()?;
    let trace_flags = u8::from_str_radix(parts[3], 16).ok()?;

    Some(TraceParent {
        version,
        trace_id,
        parent_id,
        trace_flags: TraceFlags::new(trace_flags),
    })
}

pub async fn trace_context_middleware(mut request: Request, next: Next) -> Response {
    let context = request
        .headers()
        .get("traceparent")
        .and_then(|v| v.to_str().ok())
        .and_then(parse_traceparent)
        .map(|traceparent| {
            Context::new().with_remote_span_context(SpanContext::new(
                traceparent.trace_id,
                traceparent.parent_id,
                traceparent.trace_flags,
                true,
                Default::default(),
            ))
        });

    request.extensions_mut().insert(context);

    next.run(request).await
}

/// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
///     - otlp_endpoint is an optional URL to an Open Telemetry collector
///     - otlp_service_name service name to appear in APM
///     - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO)
///     - LOG_FORMAT may be TEXT or JSON (default to TEXT)
///     - LOG_COLORIZE may be "false" or "true" (default to "true" or ansi supported platforms)
pub fn init_logging(otlp_endpoint: Option<String>, otlp_service_name: String, json_output: bool) {
    let mut layers = Vec::new();

    // STDOUT/STDERR layer
    let ansi = std::env::var("LOG_COLORIZE") != Ok("1".to_string());
    let fmt_layer = tracing_subscriber::fmt::layer()
        .with_file(true)
        .with_ansi(ansi)
        .with_line_number(true);

    let fmt_layer = match json_output {
        true => fmt_layer.json().flatten_event(true).boxed(),
        false => fmt_layer.boxed(),
    };
    layers.push(fmt_layer);

    // OpenTelemetry tracing layer
    if let Some(otlp_endpoint) = otlp_endpoint {
        global::set_text_map_propagator(TraceContextPropagator::new());

        let tracer = opentelemetry_otlp::new_pipeline()
            .tracing()
            .with_exporter(
                opentelemetry_otlp::new_exporter()
                    .tonic()
                    .with_endpoint(otlp_endpoint),
            )
            .with_trace_config(
                trace::config()
                    .with_resource(Resource::new(vec![KeyValue::new(
                        "service.name",
                        otlp_service_name,
                    )]))
                    .with_sampler(Sampler::AlwaysOn),
            )
            .install_batch(opentelemetry::runtime::Tokio);

        if let Ok(tracer) = tracer {
            layers.push(tracing_opentelemetry::layer().with_tracer(tracer).boxed());
            init_tracing_opentelemetry::init_propagator().unwrap();
        };
    }

    // Filter events with LOG_LEVEL
    let varname = "LOG_LEVEL";
    let env_filter = if let Ok(log_level) = std::env::var(varname) {
        // Override to avoid simple logs to be spammed with tokio level informations
        let log_level = match &log_level[..] {
            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
            "info" => "text_generation_launcher=info,text_generation_router=info",
            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
            log_level => log_level,
        };
        EnvFilter::builder()
            .with_default_directive(LevelFilter::INFO.into())
            .parse_lossy(log_level)
    } else {
        EnvFilter::new("info")
    };

    tracing_subscriber::registry()
        .with(env_filter)
        .with(layers)
        .init();
}


================================================
FILE: router/src/sagemaker.rs
================================================
use crate::infer::Infer;
use crate::server::{chat_completions, compat_generate, completions, ComputeType};
use crate::{
    ChatCompletion, ChatCompletionChunk, ChatRequest, Chunk, CompatGenerateRequest,
    CompletionFinal, CompletionRequest, ErrorResponse, GenerateResponse, Info, StreamResponse,
};
use axum::extract::Extension;
use axum::http::StatusCode;
use axum::response::Response;
use axum::Json;
use serde::{Deserialize, Serialize};
use tracing::instrument;
use utoipa::ToSchema;

#[derive(Clone, Deserialize, ToSchema)]
#[serde(untagged)]
pub(crate) enum SagemakerRequest {
    Generate(CompatGenerateRequest),
    Chat(ChatRequest),
    Completion(CompletionRequest),
}

// Used for OpenAPI specs
#[allow(dead_code)]
#[derive(Serialize, ToSchema)]
#[serde(untagged)]
pub(crate) enum SagemakerResponse {
    Generate(GenerateResponse),
    Chat(ChatCompletion),
    Completion(CompletionFinal),
}

// Used for OpenAPI specs
#[allow(dead_code)]
#[derive(Serialize, ToSchema)]
#[serde(untagged)]
pub(crate) enum SagemakerStreamResponse {
    Generate(StreamResponse),
    Chat(ChatCompletionChunk),
    Completion(Chunk),
}

/// Generate tokens from Sagemaker request
#[utoipa::path(
post,
tag = "Text Generation Inference",
path = "/invocations",
request_body = SagemakerRequest,
responses(
(status = 200, description = "Generated Chat Completion",
content(
("application/json" = SagemakerResponse),
("text/event-stream" = SagemakerStreamResponse),
)),
(status = 424, description = "Generation Error", body = ErrorResponse,
example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
(status = 429, description = "Model is overloaded", body = ErrorResponse,
example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
(status = 422, description = "Input validation error", body = ErrorResponse,
example = json ! ({"error": "Input validation error", "error_type": "validation"})),
(status = 500, description = "Incomplete generation", body = ErrorResponse,
example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
)
)]
#[instrument(skip_all)]
pub(crate) async fn sagemaker_compatibility(
    default_return_full_text: Extension<bool>,
    infer: Extension<Infer>,
    compute_type: Extension<ComputeType>,
    context: Extension<Option<opentelemetry::Context>>,
    info: Extension<Info>,
    Json(req): Json<SagemakerRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    match req {
        SagemakerRequest::Generate(req) => {
            compat_generate(
                default_return_full_text,
                infer,
                compute_type,
                context,
                Json(req),
            )
            .await
        }
        SagemakerRequest::Chat(req) => {
            chat_completions(infer, compute_type, info, context, Json(req)).await
        }
        SagemakerRequest::Completion(req) => {
            completions(infer, compute_type, info, context, Json(req)).await
        }
    }
}


================================================
FILE: router/src/server.rs
================================================
use crate::chat::{ChatChoice, ChatEvent, ChatState};
/// HTTP Server logic
use crate::config::Config;
use crate::infer::{Backend, Infer, InferError, InferResponse, InferStreamResponse};
#[cfg(feature = "kserve")]
use crate::kserve::{
    kerve_server_metadata, kserve_health_live, kserve_health_ready, kserve_model_infer,
    kserve_model_metadata, kserve_model_metadata_ready,
};
use crate::logging::trace_context_middleware;
use crate::sagemaker::{
    sagemaker_compatibility, SagemakerRequest, SagemakerResponse, SagemakerStreamResponse,
    __path_sagemaker_compatibility,
};
use crate::validation::ValidationError;
use crate::vertex::vertex_compatibility;
use crate::{
    usage_stats, BestOfSequence, Details, ErrorResponse, FinishReason, FunctionName,
    GenerateParameters, GenerateRequest, GenerateResponse, GrammarType, HubModelInfo,
    HubProcessorConfig, HubTokenizerConfig, Info, Message, MessageChunk, MessageContent,
    OutputMessage, PrefillToken, SimpleToken, StreamDetails, StreamOptions, StreamResponse,
    TextMessage, Token, TokenizeResponse, Tokenizer, ToolCallDelta, ToolCallMessage, Url, Usage,
    Validation,
};
use crate::{
    ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
    ChatCompletionDelta, ChatCompletionLogprob, ChatCompletionLogprobs, ChatCompletionTopLogprob,
    ChatRequest, Chunk, CompatGenerateRequest, Completion, CompletionComplete, CompletionFinal,
    CompletionRequest, CompletionType, DeltaToolCall, Function, Prompt, Tool,
};
use crate::{ChatTokenizeResponse, JsonSchemaConfig};
use crate::{FunctionDefinition, HubPreprocessorConfig, ToolCall, ToolChoice};
use crate::{MessageBody, ModelInfo, ModelsInfo};
use async_stream::__private::AsyncStream;
use axum::extract::{DefaultBodyLimit, Extension};
use axum::http::{HeaderMap, HeaderValue, Method, StatusCode};
use axum::response::sse::{Event, KeepAlive, Sse};
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post};
use axum::{http, Json, Router};
use axum_tracing_opentelemetry::middleware::OtelAxumLayer;
use futures::stream::StreamExt;
use futures::stream::{FuturesOrdered, FuturesUnordered};
use futures::Stream;
use futures::TryStreamExt;
use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo};
use hf_hub::{Cache, Repo, RepoType};
use http::header::AUTHORIZATION;
use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
use pyo3::prelude::*;
use pyo3::types::IntoPyDict;
use std::convert::Infallible;
use std::fs::File;
use std::io::BufReader;
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
use thiserror::Error;
use tokio::select;
use tokio::signal;
use tokio::sync::oneshot;
use tokio::time::Instant;
use tower_http::cors::{AllowOrigin, CorsLayer};
use tracing::{info_span, instrument, Instrument};
use tracing_opentelemetry::OpenTelemetrySpanExt;
use utoipa::OpenApi;
use utoipa_swagger_ui::SwaggerUi;

fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec<SimpleToken> {
    let offsets = encoding.get_offsets();
    let input_ids = encoding.get_ids();
    if offsets.len() == input_ids.len() {
        input_ids
            .iter()
            .zip(offsets)
            .map(|(&id, &(start, stop))| {
                let text: Vec<u8> = input.bytes().skip(start).take(stop - start).collect();
                let text: String = String::from_utf8_lossy(&text).to_string();
                SimpleToken {
                    id,
                    text,
                    start,
                    stop,
                }
            })
            .collect()
    } else {
        encoding
            .get_ids()
            .iter()
            .map(|&id| SimpleToken {
                id,
                text: "".to_string(),
                start: 0,
                stop: 0,
            })
            .collect()
    }
}

/// Generate tokens if `stream == false` or a stream of token if `stream == true`
#[utoipa::path(
post,
tag = "Text Generation Inference",
path = "/",
request_body = CompatGenerateRequest,
responses(
(status = 200, description = "Generated Text",
content(
("application/json" = Vec<GenerateResponse>),
("text/event-stream" = StreamResponse),
)),
(status = 424, description = "Generation Error", body = ErrorResponse,
example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
(status = 429, description = "Model is overloaded", body = ErrorResponse,
example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
(status = 422, description = "Input validation error", body = ErrorResponse,
example = json ! ({"error": "Input validation error", "error_type": "validation"})),
(status = 500, description = "Incomplete generation", body = ErrorResponse,
example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
)
)]
#[instrument(skip(infer, req))]
pub(crate) async fn compat_generate(
    Extension(default_return_full_text): Extension<bool>,
    infer: Extension<Infer>,
    compute_type: Extension<ComputeType>,
    context: Extension<Option<opentelemetry::Context>>,
    Json(mut req): Json<CompatGenerateRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    // default return_full_text given the pipeline_tag
    if req.parameters.return_full_text.is_none() {
        req.parameters.return_full_text = Some(default_return_full_text)
    }

    // switch on stream
    if req.stream {
        Ok(
            generate_stream(infer, compute_type, context, Json(req.into()))
                .await
                .into_response(),
        )
    } else {
        let (headers, Json(generation)) =
            generate(infer, compute_type, context, Json(req.into())).await?;
        // wrap generation inside a Vec to match api-inference
        Ok((headers, Json(vec![generation])).into_response())
    }
}

/// Text Generation Inference endpoint info
#[utoipa::path(
get,
tag = "Text Generation Inference",
path = "/info",
responses((status = 200, description = "Served model info", body = Info))
)]
#[instrument]
async fn get_model_info(info: Extension<Info>) -> Json<Info> {
    Json(info.0)
}

#[utoipa::path(
get,
tag = "Text Generation Inference",
path = "/v1/models",
responses(
(status = 200, description = "Served model info", body = ModelInfo),
(status = 404, description = "Model not found", body = ErrorResponse),
)
)]
#[instrument(skip(info))]
/// Get model info
async fn openai_get_model_info(info: Extension<Info>) -> Json<ModelsInfo> {
    Json(ModelsInfo {
        data: vec![ModelInfo {
            id: info.0.model_id.clone(),
            object: "model".to_string(),
            created: 0, // TODO: determine how to get this
            owned_by: info.0.model_id.clone(),
        }],
        ..Default::default()
    })
}

/// Template and tokenize ChatRequest
#[utoipa::path(
    post,
    tag = "Text Generation Inference",
    path = "/chat_tokenize",
    request_body = ChatRequest,
    responses(
    (status = 200, description = "Templated and tokenized ChatRequest", body = ChatTokenizeResponse),
    (status = 404, description = "Failed to tokenize ChatRequest", body = ErrorResponse),
    )
)]
async fn get_chat_tokenize(
    Extension(infer): Extension<Infer>,
    Json(chat): Json<ChatRequest>,
) -> Result<(HeaderMap, Json<ChatTokenizeResponse>), (StatusCode, Json<ErrorResponse>)> {
    metrics::counter!("tgi_request_count").increment(1);

    let generate_request: GenerateRequest = chat.try_into_generate(&infer)?.0;
    let input = generate_request.inputs.clone();
    let encoding = infer.tokenize(generate_request).await?;

    let tokens = encoding_to_tokens(&encoding, &input);

    let resp = ChatTokenizeResponse {
        tokenize_response: TokenizeResponse(tokens),
        templated_text: input,
    };
    Ok((HeaderMap::new(), Json(resp)))
}

#[utoipa::path(
get,
tag = "Text Generation Inference",
path = "/health",
responses(
(status = 200, description = "Everything is working fine"),
(status = 503, description = "Text generation inference is down", body = ErrorResponse,
example = json ! ({"error": "unhealthy", "error_type": "healthcheck"})),
)
)]
#[instrument(skip(infer))]
/// Health check method
async fn health(infer: Extension<Infer>) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
    match infer.health().await {
        true => Ok(()),
        false => Err((
            StatusCode::SERVICE_UNAVAILABLE,
            Json(ErrorResponse {
                error: "unhealthy".to_string(),
                error_type: "healthcheck".to_string(),
            }),
        )),
    }
}

/// Generate tokens
#[utoipa::path(
post,
tag = "Text Generation Inference",
path = "/generate",
request_body = GenerateRequest,
responses(
(status = 200, description = "Generated Text", body = GenerateResponse),
(status = 424, description = "Generation Error", body = ErrorResponse,
example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
(status = 429, description = "Model is overloaded", body = ErrorResponse,
example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
(status = 422, description = "Input validation error", body = ErrorResponse,
example = json ! ({"error": "Input validation error", "error_type": "validation"})),
(status = 500, description = "Incomplete generation", body = ErrorResponse,
example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
)
)]
#[instrument(
skip_all,
fields(
parameters = ? req.parameters,
total_time,
validation_time,
queue_time,
inference_time,
time_per_token,
seed,
)
)]
async fn generate(
    infer: Extension<Infer>,
    Extension(ComputeType(compute_type)): Extension<ComputeType>,
    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(req): Json<GenerateRequest>,
) -> Result<(HeaderMap, Json<GenerateResponse>), (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    if let Some(context) = context {
        span.set_parent(context);
    }

    let (headers, _, response) =
        generate_internal(infer, ComputeType(compute_type), Json(req), span).await?;
    Ok((headers, response))
}

pub(crate) async fn generate_internal(
    infer: Extension<Infer>,
    ComputeType(compute_type): ComputeType,
    Json(req): Json<GenerateRequest>,
    span: tracing::Span,
) -> Result<(HeaderMap, u32, Json<GenerateResponse>), (StatusCode, Json<ErrorResponse>)> {
    let start_time = Instant::now();
    metrics::counter!("tgi_request_count").increment(1);

    // Do not long ultra long inputs, like image payloads.
    tracing::debug!(
        "Input: {}",
        &req.inputs.chars().take(1000).collect::<String>()
    );

    let compute_characters = req.inputs.chars().count();
    let mut add_prompt = None;
    if req.parameters.return_full_text.unwrap_or(false) {
        add_prompt = Some(req.inputs.clone());
    }

    let details: bool = req.parameters.details || req.parameters.decoder_input_details;

    // Inference
    let (response, best_of_responses) = match req.parameters.best_of {
        Some(best_of) if best_of > 1 => {
            let (response, best_of_responses) = infer.generate_best_of(req, best_of).await?;
            (response, Some(best_of_responses))
        }
        _ => (infer.generate(req).await?, None),
    };

    // Token details
    let input_length = response._input_length;
    let details = match details {
        true => {
            // convert best_of_responses
            let best_of_sequences = best_of_responses.map(|responses: Vec<InferResponse>| {
                responses
                    .into_iter()
                    .map(|response: InferResponse| {
                        // Add prompt if return_full_text
                        let mut output_text = response.generated_text.text;
                        if let Some(prompt) = &add_prompt {
                            output_text = prompt.clone() + &output_text;
                        }

                        BestOfSequence {
                            generated_text: output_text,
                            finish_reason: response.generated_text.finish_reason,
                            generated_tokens: response.generated_text.generated_tokens,
                            prefill: response.prefill,
                            tokens: response.tokens,
                            top_tokens: response.top_tokens,
                            seed: response.generated_text.seed,
                        }
                    })
                    .collect()
            });

            Some(Details {
                finish_reason: response.generated_text.finish_reason,
                generated_tokens: response.generated_text.generated_tokens,
                prefill: response.prefill,
                tokens: response.tokens,
                seed: response.generated_text.seed,
                best_of_sequences,
                top_tokens: response.top_tokens,
            })
        }
        false => None,
    };

    // Timings
    let total_time = start_time.elapsed();
    let validation_time = response.queued - start_time;
    let queue_time = response.start - response.queued;
    let inference_time = Instant::now() - response.start;
    let time_per_token = inference_time / response.generated_text.generated_tokens;

    // Tracing metadata
    span.record("total_time", format!("{total_time:?}"));
    span.record("validation_time", format!("{validation_time:?}"));
    span.record("queue_time", format!("{queue_time:?}"));
    span.record("inference_time", format!("{inference_time:?}"));
    span.record("time_per_token", format!("{time_per_token:?}"));
    span.record("seed", format!("{:?}", response.generated_text.seed));

    // Headers
    let mut headers = HeaderMap::new();
    headers.insert("x-compute-type", compute_type.parse().unwrap());
    headers.insert(
        "x-compute-time",
        total_time.as_secs_f64().to_string().parse().unwrap(),
    );
    headers.insert(
        "x-compute-characters",
        compute_characters.to_string().parse().unwrap(),
    );
    headers.insert(
        "x-total-time",
        total_time.as_millis().to_string().parse().unwrap(),
    );
    headers.insert(
        "x-validation-time",
        validation_time.as_millis().to_string().parse().unwrap(),
    );
    headers.insert(
        "x-queue-time",
        queue_time.as_millis().to_string().parse().unwrap(),
    );
    headers.insert(
        "x-inference-time",
        inference_time.as_millis().to_string().parse().unwrap(),
    );
    headers.insert(
        "x-time-per-token",
        time_per_token.as_millis().to_string().parse().unwrap(),
    );
    headers.insert("x-prompt-tokens", input_length.into());
    headers.insert(
        "x-generated-tokens",
        response.generated_text.generated_tokens.into(),
    );

    // Metrics
    metrics::counter!("tgi_request_success").increment(1);
    metrics::histogram!("tgi_request_duration").record(total_time.as_secs_f64());
    metrics::histogram!("tgi_request_validation_duration").record(validation_time.as_secs_f64());
    metrics::histogram!("tgi_request_queue_duration").record(queue_time.as_secs_f64());
    metrics::histogram!("tgi_request_inference_duration").record(inference_time.as_secs_f64());
    metrics::histogram!("tgi_request_mean_time_per_token_duration")
        .record(time_per_token.as_secs_f64());
    metrics::histogram!("tgi_request_generated_tokens")
        .record(response.generated_text.generated_tokens as f64);

    // Send response
    let mut output_text = response.generated_text.text;
    if let Some(prompt) = add_prompt {
        output_text = prompt + &output_text;
    }

    tracing::debug!("Output: {}", output_text);
    tracing::info!("Success");

    let response = GenerateResponse {
        generated_text: output_text,
        details,
    };
    Ok((headers, input_length, Json(response)))
}

/// Generate a stream of token using Server-Sent Events
#[utoipa::path(
post,
tag = "Text Generation Inference",
path = "/generate_stream",
request_body = GenerateRequest,
responses(
(status = 200, description = "Generated Text", body = StreamResponse,
content_type = "text/event-stream"),
(status = 424, description = "Generation Error", body = ErrorResponse,
example = json ! ({"error": "Request failed during generation", "error_type": "generation"}),
content_type = "text/event-stream"),
(status = 429, description = "Model is overloaded", body = ErrorResponse,
example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"}),
content_type = "text/event-stream"),
(status = 422, description = "Input validation error", body = ErrorResponse,
example = json ! ({"error": "Input validation error", "error_type": "validation"}),
content_type = "text/event-stream"),
(status = 500, description = "Incomplete generation", body = ErrorResponse,
example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"}),
content_type = "text/event-stream"),
)
)]
#[instrument(
skip_all,
fields(
parameters = ? req.parameters,
total_time,
validation_time,
queue_time,
inference_time,
time_per_token,
seed,
)
)]
async fn generate_stream(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(req): Json<GenerateRequest>,
) -> (
    HeaderMap,
    Sse<impl Stream<Item = Result<Event, Infallible>>>,
) {
    let span = tracing::Span::current();
    if let Some(context) = context {
        span.set_parent(context);
    }

    let (headers, response_stream) =
        generate_stream_internal(infer, compute_type, Json(req), span).await;

    let response_stream = async_stream::stream! {
        let mut response_stream = Box::pin(response_stream);
        while let Some(raw_event) = response_stream.next().await {
            yield Ok(raw_event.map_or_else(Event::from, |token| {
                Event::default()
                    .json_data(token)
                    .unwrap_or_else(|e| InferError::StreamSerializationError(e.to_string()).into())
            }));
        }
    };

    let sse = Sse::new(response_stream).keep_alive(KeepAlive::default());
    (headers, sse)
}

async fn generate_stream_internal(
    infer: Infer,
    ComputeType(compute_type): ComputeType,
    Json(req): Json<GenerateRequest>,
    span: tracing::Span,
) -> (
    HeaderMap,
    impl Stream<Item = Result<StreamResponse, InferError>>,
) {
    let start_time = Instant::now();
    metrics::counter!("tgi_request_count").increment(1);

    tracing::debug!("Input: {}", req.inputs);

    let compute_characters = req.inputs.chars().count();

    let mut headers = HeaderMap::new();
    headers.insert("x-compute-type", compute_type.parse().unwrap());
    headers.insert(
        "x-compute-characters",
        compute_characters.to_string().parse().unwrap(),
    );
    headers.insert("X-Accel-Buffering", "no".parse().unwrap());

    let stream = async_stream::stream! {
        // Inference
        let mut end_reached = false;
        let mut error = false;

        let mut add_prompt = None;
        if req.parameters.return_full_text.unwrap_or(false) {
            add_prompt = Some(req.inputs.clone());
        }
        let details = req.parameters.details;

        let best_of = req.parameters.best_of.unwrap_or(1);
        if best_of != 1 {
            let err = InferError::from(ValidationError::BestOfStream);
            metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
            tracing::error!("{err}");
            yield Err(err);
        } else if req.parameters.decoder_input_details {
            let err = InferError::from(ValidationError::PrefillDetailsStream);
            metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
            tracing::error!("{err}");
            yield Err(err);
        } else {
            match infer.generate_stream(req).instrument(info_span!(parent: &span, "async_stream")).await {
                // Keep permit as long as generate_stream lives
                Ok((_permit, input_length, response_stream)) => {
                    let mut index = 0;
                    let mut response_stream = Box::pin(response_stream);
                    // Server-Sent Event stream
                    while let Some(response) = response_stream.next().await {
                        index += 1;
                        match response {
                            Ok(response) => {
                                match response {
                                    // Prefill is ignored
                                    InferStreamResponse::Prefill(_) => {}
                                    // Yield event for every new token
                                    InferStreamResponse::Intermediate{
                                        token,
                                        top_tokens,
                                    } => {
                                        tracing::debug!(parent: &span, "Token: {:?}", token);

                                        // StreamResponse
                                        let stream_token = StreamResponse {
                                            index,
                                            token,
                                            top_tokens,
                                            generated_text: None,
                                            details: None,
                                        };
                                        yield Ok(stream_token);
                                    }
                                    // Yield event for last token and compute timings
                                    InferStreamResponse::End {
                                        token,
                                        generated_text,
                                        start,
                                        queued,
                                        top_tokens,
                                    } => {
                                        // Token details
                                        let details = match details {
                                            true => Some(StreamDetails {
                                                finish_reason: generated_text.finish_reason,
                                                generated_tokens: generated_text.generated_tokens,
                                                seed: generated_text.seed,
                                                input_length,
                                            }),
                                            false => None,
                                        };

                                        // Timings
                                        let total_time = start_time.elapsed();
                                        let validation_time = queued - start_time;
                                        let queue_time = start - queued;
                                        let inference_time = Instant::now() - start;
                                        let time_per_token = inference_time / generated_text.generated_tokens;

                                        // Tracing metadata
                                        span.record("total_time", format!("{total_time:?}"));
                                        span.record("validation_time", format!("{validation_time:?}"));
                                        span.record("queue_time", format!("{queue_time:?}"));
                                        span.record("inference_time", format!("{inference_time:?}"));
                                        span.record("time_per_token", format!("{time_per_token:?}"));
                                        span.record("seed", format!("{:?}", generated_text.seed));

                                        // Metrics
                                        metrics::counter!("tgi_request_success").increment(1);
                                        metrics::histogram!("tgi_request_duration").record(total_time.as_secs_f64());
                                        metrics::histogram!("tgi_request_validation_duration").record(validation_time.as_secs_f64());
                                        metrics::histogram!("tgi_request_queue_duration").record(queue_time.as_secs_f64());
                                        metrics::histogram!("tgi_request_inference_duration").record(inference_time.as_secs_f64());
                                        metrics::histogram!("tgi_request_mean_time_per_token_duration").record(time_per_token.as_secs_f64());
                                        metrics::histogram!("tgi_request_generated_tokens").record(generated_text.generated_tokens as f64);

                                        // StreamResponse
                                        end_reached = true;

                                        let mut output_text = generated_text.text;
                                        if let Some(prompt) = add_prompt {
                                            output_text = prompt + &output_text;
                                        }

                                        tracing::debug!(parent: &span, "Output: {}", output_text);
                                        tracing::info!(parent: &span, "Success");

                                        let stream_token = StreamResponse {
                                            index,
                                            token,
                                            top_tokens,
                                            generated_text: Some(output_text),
                                            details
                                        };

                                        yield Ok(stream_token);
                                        break;
                                    }
                                }
                            }
                            // yield error
                            Err(err) => {
                                error = true;
                                yield Err(err);
                                break;
                            }
                        }
                    }
                },
                // yield error
                Err(err) => {
                    error = true;
                    yield Err(err);
                }
            }
            // Check if generation reached the end
            // Skip if we already sent an error
            if !end_reached && !error {
                let err = InferError::IncompleteGenerationStream;
                metrics::counter!("tgi_request_failure", "err" => "incomplete").increment(1);
                tracing::error!("{err}");
                yield Err(err);
            }
        }
    };

    (headers, stream)
}

/// Generate tokens
#[utoipa::path(
post,
tag = "Text Generation Inference",
path = "/v1/completions",
request_body = CompletionRequest,
responses(
(status = 200, description = "Generated Chat Completion",
content(
("application/json" = CompletionFinal),
("text/event-stream" = Chunk),
)),
(status = 424, description = "Generation Error", body = ErrorResponse,
example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
(status = 429, description = "Model is overloaded", body = ErrorResponse,
example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
(status = 422, description = "Input validation error", body = ErrorResponse,
example = json ! ({"error": "Input validation error", "error_type": "validation"})),
(status = 500, description = "Incomplete generation", body = ErrorResponse,
example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
)
)]
#[instrument(
skip_all,
fields(
// parameters = ? req.parameters,
total_time,
validation_time,
queue_time,
inference_time,
time_per_token,
seed,
)
)]
pub(crate) async fn completions(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Extension(info): Extension<Info>,
    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(req): Json<CompletionRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    if let Some(context) = context {
        span.set_parent(context);
    }

    metrics::counter!("tgi_request_count").increment(1);

    let CompletionRequest {
        model,
        max_tokens,
        seed,
        stop,
        stream,
        temperature,
        ..
    } = req;

    let max_new_tokens = max_tokens;
    let stop = stop.unwrap_or_default();
    // enable greedy only when temperature is 0
    let (do_sample, temperature) = match temperature {
        Some(0.0) => (false, None),
        other => (true, other),
    };

    // if suffix is present throw an error
    if req.suffix.is_some() {
        metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
        return Err((
            StatusCode::UNPROCESSABLE_ENTITY,
            Json(ErrorResponse {
                error: "Suffix is not supported and can be achieved by preprocessing the prompt."
                    .to_string(),
                error_type: "suffix not supported".to_string(),
            }),
        ));
    }

    if req.prompt.0.len() > info.max_client_batch_size {
        metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
        return Err((
            StatusCode::UNPROCESSABLE_ENTITY,
            Json(ErrorResponse {
                error: format!(
                    "Number of prompts exceeds the maximum allowed batch size of {}",
                    info.max_client_batch_size
                ),
                error_type: "batch size exceeded".to_string(),
            }),
        ));
    }

    let generate_requests: Vec<GenerateRequest> = req
        .prompt
        .0
        .iter()
        .map(|prompt| GenerateRequest {
            inputs: prompt.to_string(),
            add_special_tokens: true,
            parameters: GenerateParameters {
                best_of: None,
                temperature,
                repetition_penalty: req.repetition_penalty,
                frequency_penalty: req.frequency_penalty,
                top_k: None,
                top_p: req.top_p,
                typical_p: None,
                do_sample,
                max_new_tokens,
                return_full_text: None,
                stop: stop.clone(),
                truncate: None,
                watermark: false,
                details: true,
                decoder_input_details: !stream,
                seed,
                top_n_tokens: None,
                grammar: None,
                adapter_id: model.as_ref().filter(|m| *m != "tgi").map(String::from),
            },
        })
        .collect();

    let mut x_compute_type = None;
    let mut x_compute_characters = 0u32;
    let mut x_accel_buffering = None;

    if stream {
        let mut response_streams = FuturesOrdered::new();
        for (index, generate_request) in generate_requests.into_iter().enumerate() {
            let model_id = info.model_id.clone();
            let system_fingerprint =
                format!("{}-{}", info.version, info.docker_label.unwrap_or("native"));
            let infer_clone = infer.clone();
            let compute_type_clone = compute_type.clone();
            let span_clone = span.clone();

            // Create a future for each generate_stream_internal call.
            let generate_future = async move {
                let (header_tx, header_rx) = oneshot::channel();
                let (sse_tx, sse_rx) = tokio::sync::mpsc::unbounded_channel();

                tokio::spawn(async move {
                    let (headers, response_stream) = generate_stream_internal(
                        infer_clone.clone(),
                        compute_type_clone.clone(),
                        Json(generate_request),
                        span_clone.clone(),
                    )
                    .await;

                    let response_stream = async_stream::stream! {
                        let mut response_stream = Box::pin(response_stream);

                        while let Some(stream_token) = response_stream.next().await {
                            match stream_token {
                                Ok(stream_token) => {
                                    let event = Event::default();

                                    let current_time = std::time::SystemTime::now()
                                        .duration_since(std::time::UNIX_EPOCH)
                                        .unwrap_or_else(|_| std::time::Duration::from_secs(0))
                                        .as_secs();

                                    let message = match stream_token.details {
                                        Some(details) => {
                                            let completion_tokens = details.generated_tokens;
                                            let prompt_tokens = details.input_length;
                                            let total_tokens = prompt_tokens + completion_tokens;

                                            Completion::Final(CompletionFinal {
                                                id: String::new(),
                                                created: current_time,
                                                model: model_id.clone(),
                                                system_fingerprint: system_fingerprint.clone(),
                                                choices: vec![CompletionComplete {
                                                    finish_reason: details.finish_reason.to_string(),
                                                    index: index as u32,
                                                    logprobs: None,
                                                    text: stream_token.token.text,
                                                }],
                                                usage: Usage {
                                                    prompt_tokens,
                                                    completion_tokens,
                                                    total_tokens,
                                                },
                                            })
                                        }
                                        None => Completion::Chunk(Chunk {
                                            id: String::new(),
                                            created: current_time,
                                            choices: vec![CompletionComplete {
                                                finish_reason: String::new(),
                                                index: index as u32,
                                                logprobs: None,
                                                text: stream_token.token.text,
                                            }],
                                            model: model_id.clone(),
                                            system_fingerprint: system_fingerprint.clone(),
                                        }),
                                    };

                                    let event = event
                                        .json_data(message)
                                        .unwrap_or_else(|_e| Event::default());

                                    yield Ok(event);
                                }
                                Err(err) => yield Ok(err.into_openai_event()),
                            }
                        }
                    };

                    // send and dont wait for response
                    let _ = header_tx.send(headers);

                    // pin an emit messages to the sse_tx
                    let mut sse = Box::pin(response_stream);
                    while let Some(event) = sse.next().await {
                        if sse_tx.send(event).is_err() {
                            tracing::error!("Failed to send event. Receiver dropped.");
                            break;
                        }
                    }
                });

                (header_rx, sse_rx)
            };
            response_streams.push_back(generate_future);
        }

        let mut all_rxs = vec![];

        while let Some((header_rx, sse_rx)) = response_streams.next().await {
            all_rxs.push(sse_rx);

            // get the headers from the first response of each stream
            let headers = header_rx.await.map_err(|e| {
                tracing::error!("Failed to get headers: {:?}", e);
                (
                    StatusCode::INTERNAL_SERVER_ERROR,
                    Json(ErrorResponse {
                        error: "Failed to get headers".to_string(),
                        error_type: "headers".to_string(),
                    }),
                )
            })?;
            if x_compute_type.is_none() {
                x_compute_type = headers
                    .get("x-compute-type")
                    .and_then(|v| v.to_str().ok())
                    .map(|v| v.to_string());

                x_accel_buffering = headers
                    .get("x-accel-buffering")
                    .and_then(|v| v.to_str().ok())
                    .map(|v| v.to_string());
            }
            x_compute_characters += headers
                .get("x-compute-characters")
                .and_then(|v| v.to_str().ok())
                .and_then(|v| v.parse().ok())
                .unwrap_or(0);
        }

        let mut headers = HeaderMap::new();
        if let Some(x_compute_type) = x_compute_type {
            headers.insert("x-compute-type", x_compute_type.parse().unwrap());
        }
        headers.insert("x-compute-characters", x_compute_characters.into());
        if let Some(x_accel_buffering) = x_accel_buffering {
            headers.insert("x-accel-buffering", x_accel_buffering.parse().unwrap());
        }

        // now sink the sse streams into a single stream and remove the ones that are done
        let stream: AsyncStream<Result<Event, Infallible>, _> = async_stream::stream! {
            loop {
                let mut i = 0;
                while i < all_rxs.len() {
                    let rx = &mut all_rxs[i];
                    select! {
                        Some(event) = rx.recv() => {
                            yield event;
                        }
                        else => {
                            all_rxs.remove(i);
                            continue; // skip the increment to handle the next element at the same index
                        }
                    }
                    i += 1; // only increment when no element was removed
                }

                if all_rxs.is_empty() {
                    break;
                }
            }
        };

        let stream = stream.chain(futures::stream::once(async {
            Ok(Event::default().data("[DONE]"))
        }));

        let sse = Sse::new(stream).keep_alive(KeepAlive::default());
        Ok((headers, sse).into_response())
    } else {
        let current_time = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_else(|_| std::time::Duration::from_secs(0))
            .as_secs();

        let responses = FuturesUnordered::new();
        for (index, generate_request) in generate_requests.into_iter().enumerate() {
            let infer_clone = infer.clone();
            let compute_type_clone = compute_type.clone();
            let span_clone = span.clone();
            let response_future = async move {
                let result = generate_internal(
                    Extension(infer_clone),
                    compute_type_clone,
                    Json(generate_request),
                    span_clone,
                )
                .await;
                result.map(|(headers, input_length, generation)| {
                    (index, headers, input_length, generation)
                })
            };
            responses.push(response_future);
        }
        let generate_responses = responses.try_collect::<Vec<_>>().await?;

        let mut prompt_tokens = 0u32;
        let mut completion_tokens = 0u32;
        let mut total_tokens = 0u32;

        let mut x_compute_time = 0u32;
        let mut x_total_time = 0u32;
        let mut x_validation_time = 0u32;
        let mut x_queue_time = 0u32;
        let mut x_inference_time = 0u32;
        let mut x_time_per_token = 0u32;
        let mut x_prompt_tokens = 0u32;
        let mut x_generated_tokens = 0u32;

        let choices = generate_responses
            .into_iter()
            .map(|(index, headers, input_length, Json(generation))| {
                let details = generation.details.ok_or((
                    // this should never happen but handle if details are missing unexpectedly
                    StatusCode::INTERNAL_SERVER_ERROR,
                    Json(ErrorResponse {
                        error: "No details in generation".to_string(),
                        error_type: "no details".to_string(),
                    }),
                ))?;

                if x_compute_type.is_none() {
                    x_compute_type = headers
                        .get("x-compute-type")
                        .and_then(|v| v.to_str().ok())
                        .map(|v| v.to_string());
                }

                // accumulate headers and usage from each response
                x_compute_time += headers
                    .get("x-compute-time")
                    .and_then(|v| v.to_str().ok()?.parse().ok())
                    .unwrap_or(0);
                x_compute_characters += headers
                    .get("x-compute-characters")
                    .and_then(|v| v.to_str().ok()?.parse().ok())
                    .unwrap_or(0);
                x_total_time += headers
                    .get("x-total-time")
                    .and_then(|v| v.to_str().ok()?.parse().ok())
                    .unwrap_or(0);
                x_validation_time += headers
                    .get("x-validation-time")
                    .and_then(|v| v.to_str().ok()?.parse().ok())
                    .unwrap_or(0);
                x_queue_time += headers
                    .get("x-queue-time")
                    .and_then(|v| v.to_str().ok()?.parse().ok())
                    .unwrap_or(0);
                x_inference_time += headers
                    .get("x-inference-time")
                    .and_then(|v| v.to_str().ok()?.parse().ok())
                    .unwrap_or(0);
                x_time_per_token += headers
                    .get("x-time-per-token")
                    .and_then(|v| v.to_str().ok()?.parse().ok())
                    .unwrap_or(0);
                x_prompt_tokens += headers
                    .get("x-prompt-tokens")
                    .and_then(|v| v.to_str().ok()?.parse().ok())
                    .unwrap_or(0);
                x_generated_tokens += headers
                    .get("x-generated-tokens")
                    .and_then(|v| v.to_str().ok()?.parse().ok())
                    .unwrap_or(0);

                prompt_tokens += input_length;
                completion_tokens += details.generated_tokens;
                total_tokens += input_length + details.generated_tokens;

                Ok(CompletionComplete {
                    finish_reason: details.finish_reason.format(true),
                    index: index as u32,
                    logprobs: None,
                    text: generation.generated_text,
                })
            })
            .collect::<Result<Vec<_>, _>>()
            .map_err(|(status, Json(err))| (status, Json(err)))?;

        let response = Completion::Final(CompletionFinal {
            id: "".to_string(),
            created: current_time,
            model: info.model_id.clone(),
            system_fingerprint: format!(
                "{}-{}",
                info.version,
                info.docker_label.unwrap_or("native")
            ),
            choices,
            usage: Usage {
                prompt_tokens,
                completion_tokens,
                total_tokens,
            },
        });

        // headers similar to `generate` but aggregated
        let mut headers = HeaderMap::new();
        if let Some(x_compute_type) = x_compute_type {
            headers.insert("x-compute-type", x_compute_type.parse().unwrap());
        }
        headers.insert("x-compute-characters", x_compute_characters.into());
        headers.insert("x-total-time", x_total_time.into());
        headers.insert("x-validation-time", x_validation_time.into());
        headers.insert("x-queue-time", x_queue_time.into());
        headers.insert("x-inference-time", x_inference_time.into());
        headers.insert("x-time-per-token", x_time_per_token.into());
        headers.insert("x-prompt-tokens", x_prompt_tokens.into());
        headers.insert("x-generated-tokens", x_generated_tokens.into());
        if let Some(x_accel_buffering) = x_accel_buffering {
            headers.insert("x-accel-buffering", x_accel_buffering.parse().unwrap());
        }
        Ok((headers, Json(response)).into_response())
    }
}

/// Generate tokens
#[utoipa::path(
post,
tag = "Text Generation Inference",
path = "/v1/chat/completions",
request_body = ChatRequest,
responses(
(status = 200, description = "Generated Chat Completion",
content(
("application/json" = ChatCompletion),
("text/event-stream" = ChatCompletionChunk),
)),
(status = 424, description = "Generation Error", body = ErrorResponse,
example = json ! ({"error": "Request failed during generation", "error_type": "generation"})),
(status = 429, description = "Model is overloaded", body = ErrorResponse,
example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})),
(status = 422, description = "Input validation error", body = ErrorResponse,
example = json ! ({"error": "Input validation error", "error_type": "validation"})),
(status = 500, description = "Incomplete generation", body = ErrorResponse,
example = json ! ({"error": "Incomplete generation", "error_type": "incomplete_generation"})),
)
)]
#[instrument(
    skip_all,
    fields(
        parameters,
        total_time,
        validation_time,
        queue_time,
        inference_time,
        time_per_token,
        seed,
    )
)]
pub(crate) async fn chat_completions(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Extension(info): Extension<Info>,
    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(mut chat): Json<ChatRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    if let Some(context) = context {
        span.set_parent(context);
    }

    metrics::counter!("tgi_request_count").increment(1);
    let ChatRequest {
        model,
        stream,
        stream_options,
        logprobs,
        ..
    } = chat.clone();

    tracing::debug!("Got chat_template {:?}", infer.chat_template);
    let id = chat.next_tool_call_id();
    let (generate_request, using_tools): (GenerateRequest, bool) =
        chat.clone().try_into_generate(&infer)?;
    span.record("parameters", format!("{:?}", generate_request.parameters));
    let logprobs = logprobs.unwrap_or_default();

    // extract model id from request if specified
    let model_id = match model.as_deref() {
        Some("tgi") | None => info.model_id.clone(),
        Some(m_id) => m_id.to_string(),
    };
    let system_fingerprint = format!("{}-{}", info.version, info.docker_label.unwrap_or("native"));
    // switch on stream
    if stream {
        let (headers, response_stream) = generate_stream_internal(
            infer.clone(),
            compute_type.clone(),
            Json(generate_request),
            span.clone(),
        )
        .await;

        let response_stream = async_stream::stream! {
            let mut response_stream = Box::pin(response_stream);
            let mut state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone());
            while let Some(result) = response_stream.next().await {
                match result{
                Ok(stream_token) => {
                    let events = state.push(stream_token);
                    match events{
                        ChatEvent::NoTool => {
                            chat.tools = None;
                            chat.response_format = None;
                            let (generate_request, using_tools): (GenerateRequest, bool) =
                                chat.clone().try_into_generate(&infer).unwrap();
                            assert!(!using_tools);
                            let (_headers, response_stream2) =
                                generate_stream_internal(infer.clone(), compute_type.clone(), Json(generate_request), span.clone()).await;
                            state = ChatState::new(using_tools, stream_options.clone(), system_fingerprint.clone(), model_id.clone(), logprobs, id.clone());
                            response_stream = Box::pin(response_stream2);
                        }
                        ChatEvent::Events(events) => {
                            for chat_complete in events{
                                yield Ok(Event::default().json_data(chat_complete).unwrap_or_else(|e| {
                                    tracing::error!("Failed to serialize ChatCompletionChunk: {:?}", e);
                                    Event::default()
                                }));
                            }
                        }
                    }
                }
                Err(err) => yield Ok(err.into_openai_event())
                }
            }
            yield Ok::<Event, Infallible>(Event::default().data("[DONE]"));
        };

        let sse = Sse::new(response_stream).keep_alive(KeepAlive::default());
        Ok((headers, sse).into_response())
    } else {
        let (mut headers, mut input_length, Json(generation)) = generate_internal(
            Extension(infer.clone()),
            compute_type.clone(),
            Json(generate_request),
            span.clone(),
        )
        .await?;

        let current_time = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_else(|_| std::time::Duration::from_secs(0))
            .as_secs();

        let (tool_calls, output) = if using_tools {
            match crate::chat::parse_output(&generation.generated_text)? {
                ChatChoice::NoTool => {
                    chat.tools = None;
                    chat.response_format = None;
                    let (generate_request, using_tools): (GenerateRequest, bool) =
                        chat.clone().try_into_generate(&infer)?;
                    assert!(!using_tools);
                    let (headers_final, input_length_final, Json(generation)) = generate_internal(
                        Extension(infer),
                        compute_type,
                        Json(generate_request),
                        span,
                    )
                    .await?;
                    headers = headers_final;
                    input_length = input_length_final;
                    (None, Some(generation.generated_text))
                }
                ChatChoice::ToolCalls(tool_calls) => (Some(tool_calls), None),
            }
        } else {
            (None, Some(generation.generated_text))
        };
        // build the complete response object with the full text
        let response = CompletionType::ChatCompletion(ChatCompletion::new(
            model_id,
            system_fingerprint,
            output,
            current_time,
            generation.details.unwrap(),
            logprobs,
            tool_calls,
            input_length,
        ));

        // wrap generation inside a Vec to match api-inference
        Ok((headers, Json(response)).into_response())
    }
}

/// Tokenize inputs
#[utoipa::path(
post,
tag = "Text Generation Inference",
path = "/tokenize",
request_body = GenerateRequest,
responses(
(status = 200, description = "Tokenized ids", body = TokenizeResponse),
(status = 404, description = "No tokenizer found", body = ErrorResponse,
example = json ! ({"error": "No fast tokenizer available"})),
)
)]
#[instrument(skip_all)]
async fn tokenize(
    Extension(infer): Extension<Infer>,
    Json(req): Json<GenerateRequest>,
) -> Result<Json<TokenizeResponse>, (StatusCode, Json<ErrorResponse>)> {
    let input = req.inputs.clone();
    let encoding = infer.tokenize(req).await?;
    let tokens = encoding_to_tokens(&encoding, &input);
    Ok(Json(TokenizeResponse(tokens)))
}

/// Prometheus metrics scrape endpoint
#[utoipa::path(
    get,
    tag = "Text Generation Inference",
    path = "/metrics",
    responses((status = 200, description = "Prometheus Metrics", body = String))
)]
async fn metrics(prom_handle: Extension<PrometheusHandle>) -> String {
    prom_handle.render()
}

#[derive(Clone, Debug)]
pub(crate) struct ComputeType(String);

// OpenAPI documentation
#[derive(OpenApi)]
#[openapi(
paths(
health,
get_model_info,
compat_generate,
generate,
generate_stream,
chat_completions,
completions,
tokenize,
metrics,
openai_get_model_info,
sagemaker_compatibility,
get_chat_tokenize,
),
components(
schemas(
Info,
CompatGenerateRequest,
SagemakerRequest,
GenerateRequest,
GrammarType,
JsonSchemaConfig,
ChatRequest,
Message,
MessageContent,
MessageChunk,
Url,
FunctionName,
OutputMessage,
TextMessage,
ToolCallMessage,
ToolCallDelta,
ChatCompletionComplete,
ChatCompletionChoice,
ChatCompletionDelta,
ChatCompletionChunk,
ChatCompletionLogprob,
ChatCompletionLogprobs,
ChatCompletionTopLogprob,
ChatCompletion,
CompletionRequest,
CompletionComplete,
SagemakerResponse,
SagemakerStreamResponse,
Chunk,
Completion,
CompletionFinal,
Prompt,
GenerateParameters,
PrefillToken,
Token,
GenerateResponse,
TokenizeResponse,
SimpleToken,
BestOfSequence,
Details,
FinishReason,
StreamResponse,
StreamDetails,
ErrorResponse,
GrammarType,
Usage,
StreamOptions,
DeltaToolCall,
Tool,
ToolCall,
Function,
FunctionDefinition,
ToolChoice,
ModelInfo,
ChatTokenizeResponse,
MessageBody,
)
),
tags(
(name = "Text Generation Inference", description = "Hugging Face Text Generation Inference API")
),
info(
title = "Text Generation Inference",
license(
name = "Apache 2.0",
url = "https://www.apache.org/licenses/LICENSE-2.0"
)
)
)]
pub struct ApiDoc;

pub fn schema() -> ApiDoc {
    ApiDoc
}

pub fn py_resolve_tokenizer(
    py: pyo3::Python,
    tokenizer_name: &str,
    revision: Option<&str>,
    trust_remote_code: bool,
) -> pyo3::PyResult<()> {
    let transformers = py.import_bound("transformers")?;
    let auto = transformers.getattr("AutoTokenizer")?;
    let from_pretrained = auto.getattr("from_pretrained")?;
    let args = (tokenizer_name,);
    let kwargs = if let Some(rev) = &revision {
        [
            ("revision", rev.to_string().into_py(py)),
            ("trust_remote_code", trust_remote_code.into_py(py)),
        ]
        .into_py_dict_bound(py)
    } else {
        [("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py)
    };
    let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
    let save = tokenizer.getattr("save_pretrained")?;
    let args = ("out".to_string(),);
    save.call1(args)?;
    Ok(())
}

pub fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> {
    // XXX Legacy case for FasterDecoding/medusa-vicuna-7b-v1.3
    // and state-spaces/mamba-130m
    tracing::warn!("Odd tokenizer detected, falling back on legacy tokenization");

    #[derive(serde::Deserialize)]
    struct FallbackConfig {
        base_model_name_or_path: Option<String>,
        model_type: Option<String>,
        ssm_config: Option<serde_json::Value>,
    }
    config_filename.and_then(|filename| {
        std::fs::read_to_string(filename)
            .ok()
            .as_ref()
            .and_then(|c| {
                let config: Result<FallbackConfig, _> = serde_json::from_str(c);
                if let Ok(config) = config {
                    if config.model_type.is_none() {
                        if let Some(base) = config.base_model_name_or_path {
                            pyo3::Python::with_gil(|py| -> PyResult<()> {
                                py_resolve_tokenizer(py, &base, Some("main"), false)
                            })
                            .ok()?;
                        }
                    }
                    if config.ssm_config.is_some() {
                        // XXX Legacy mamba
                        pyo3::Python::with_gil(|py| -> PyResult<()> {
                            py_resolve_tokenizer(py, "EleutherAI/gpt-neox-20b", Some("main"), false)
                        })
                        .ok()?;
                    }
                }
                Some(())
            })
    })
}

/// Serving method
#[allow(clippy::too_many_arguments)]
pub async fn run(
    backend: impl Backend + Send + Sync + 'static,
    max_concurrent_requests: usize,
    max_best_of: usize,
    max_stop_sequences: usize,
    max_top_n_tokens: u32,
    max_input_tokens: usize,
    max_total_tokens: usize,
    validation_workers: usize,
    api_key: Option<String>,
    tokenizer_name: String,
    tokenizer_config_path: Option<String>,
    revision: Option<String>,
    trust_remote_code: bool,
    hostname: String,
    port: u16,
    cors_allow_origin: Option<Vec<String>>,
    ngrok: bool,
    _ngrok_authtoken: Option<String>,
    _ngrok_edge: Option<String>,
    disable_grammar_support: bool,
    max_client_batch_size: usize,
    usage_stats_level: usage_stats::UsageStatsLevel,
    payload_limit: usize,
    max_image_fetch_size: usize,
    prometheus_port: u16,
) -> Result<(), WebServerError> {
    // CORS allowed origins
    // map to go inside the option and then map to parse from String to HeaderValue
    // Finally, convert to AllowOrigin
    let allow_origin: Option<AllowOrigin> = cors_allow_origin.map(|cors_allow_origin| {
        AllowOrigin::list(
            cors_allow_origin
                .iter()
                .map(|origin| origin.parse::<HeaderValue>().unwrap()),
        )
    });

    // Parse Huggingface hub token
    let authorization_token = std::env::var("HF_TOKEN")
        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
        .ok();

    // Tokenizer instance
    // This will only be used to validate payloads
    let local_path = Path::new(&tokenizer_name);

    // Shared API builder initialization
    let api_builder = || {
        let mut builder = ApiBuilder::from_env().with_progress(false);
        if let Some(token) = authorization_token {
            builder = builder.with_token(Some(token));
        }

        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
            builder = builder.with_cache_dir(cache_dir.into());
        }

        if let Ok(origin) = std::env::var("HF_HUB_USER_AGENT_ORIGIN") {
            builder = builder.with_user_agent("origin", origin.as_str());
        }

        builder
    };

    // Decide if we need to use the API based on the revision and local path
    let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir();

    // Initialize API if needed
    #[derive(Clone)]
    enum Type {
        Api(Api),
        Cache(Cache),
        None,
    }
    let api = if use_api {
        if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) {
            let cache = std::env::var("HUGGINGFACE_HUB_CACHE")
                .map_err(|_| ())
                .map(|cache_dir| Cache::new(cache_dir.into()))
                .unwrap_or_else(|_| Cache::from_env());
            tracing::warn!("Offline mode active using cache defaults");
            Type::Cache(cache)
        } else {
            tracing::info!("Using the Hugging Face API");
            match api_builder().build() {
                Ok(api) => Type::Api(api),
                Err(_) => {
                    tracing::warn!("Unable to build the Hugging Face API");
                    Type::None
                }
            }
        }
    } else {
        Type::None
    };

    // Load tokenizer and model info
    let (
        config_filename,
        tokenizer_config_filename,
        preprocessor_config_filename,
        processor_config_filename,
        chat_template_filename,
        model_info,
    ) = match api {
        Type::None => (
            Some(local_path.join("config.json")),
            Some(local_path.join("tokenizer_config.json")),
            Some(local_path.join("preprocessor_config.json")),
            Some(local_path.join("processor_config.json")),
            Some(local_path.join("chat_template.json")),
            None,
        ),
        Type::Api(api) => {
            let api_repo = api.repo(Repo::with_revision(
                tokenizer_name.to_string(),
                RepoType::Model,
                revision.clone().unwrap_or_else(|| "main".to_string()),
            ));

            let config_filename = api_repo.get("config.json").await.ok();
            let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
            let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
            let processor_config_filename = api_repo.get("processor_config.json").await.ok();
            let chat_template_filename = api_repo.get("chat_template.json").await.ok();

            let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await {
                Some(model_info)
            } else {
                tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
                None
            };
            (
                config_filename,
                tokenizer_config_filename,
                preprocessor_config_filename,
                processor_config_filename,
                chat_template_filename,
                model_info,
            )
        }
        Type::Cache(cache) => {
            tracing::info!("Cache {cache:?}");
            let repo = cache.repo(Repo::with_revision(
                tokenizer_name.to_string(),
                RepoType::Model,
                revision.clone().unwrap_or_else(|| "main".to_string()),
            ));
            (
                repo.get("config.json"),
                repo.get("tokenizer_config.json"),
                repo.get("preprocessor_config.json"),
                repo.get("processor_config.json"),
                repo.get("chat_template.json"),
                None,
            )
        }
    };

    // if chat_template_filename is present, load the chat template
    let chat_template: Option<crate::ChatTemplateVersions> = chat_template_filename
        .and_then(|f| std::fs::read_to_string(f).ok())
        .and_then(|c| {
            let res = serde_json::from_str::<crate::ChatTemplateStandalone>(&c);
            if let Err(e) = &res {
                tracing::warn!("Could not parse chat template {e:?}");
            }
            res.ok().map(|t| t.chat_template)
        });

    // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
    tracing::warn!("Tokenizer_config {tokenizer_config_path:?} - {tokenizer_config_filename:?}");
    let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
    {
        HubTokenizerConfig::from_file(filename)
    } else {
        tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
    };
    let mut tokenizer_config = tokenizer_config.unwrap_or_else(|| {
        tracing::warn!("Could not find tokenizer config locally and no API specified");
        HubTokenizerConfig::default()
    });

    if chat_template.is_some() {
        tracing::info!("Using chat template from chat_template.json");
        tokenizer_config.chat_template = chat_template;
    }

    let tokenizer: Result<Tokenizer, WebServerError> = {
        use pyo3::prelude::*;
        Python::with_gil(|py| -> PyResult<()> {
            py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), trust_remote_code)?;
            Ok(())
        })
        .inspect_err(|err| {
            tracing::error!("Failed to import python tokenizer {err}");
        })
        .or_else(|err| {
            let out = legacy_tokenizer_handle(config_filename.as_ref());
            out.ok_or(err)
        })
        .map_err(|_| WebServerError::Tokenizer("Unable to load tokenizer.".to_string()))?;
        let filename = "out/tokenizer.json";
        if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) {
            Ok(Tokenizer::Rust(tok))
        } else {
            Ok(Tokenizer::Python {
                tokenizer_name: tokenizer_name.clone(),
                revision: revision.clone(),
                trust_remote_code,
            })
        }
    };

    let config: Option<Config> = config_filename.and_then(|filename| {
        std::fs::read_to_string(filename)
            .ok()
            .as_ref()
            .and_then(|c| {
                let config: Result<Config, _> = serde_json::from_str(c);
                if let Err(err) = &config {
                    tracing::warn!("Could not parse config {err:?}");
                }
                config.ok()
            })
    });
    let model_info = model_info.unwrap_or_else(|| HubModelInfo {
        model_id: tokenizer_name.to_string(),
        sha: None,
        pipeline_tag: None,
    });

    let processor_config = processor_config_filename
        .and_then(HubProcessorConfig::from_file)
        .unwrap_or_default();

    let preprocessor_config: Option<HubPreprocessorConfig> =
        preprocessor_config_filename.and_then(HubPreprocessorConfig::from_file);

    tracing::info!("Using config {config:?}");

    // Only send usage stats when TGI is run in container and the function returns Some
    let is_container = matches!(usage_stats::is_container(), Ok(true));
    // retrieve the huggingface_hub user agent origin if set, and add the origin to telemetry
    let origin = std::env::var("HF_HUB_USER_AGENT_ORIGIN").ok();
    let user_agent = match (usage_stats_level, is_container) {
        (usage_stats::UsageStatsLevel::On | usage_stats::UsageStatsLevel::NoStack, true) => {
            let reduced_args = usage_stats::Args::new(
                config.clone(),
                tokenizer_config.tokenizer_class.clone(),
                max_concurrent_requests,
                max_best_of,
                max_stop_sequences,
                max_top_n_tokens,
                max_input_tokens,
                max_total_tokens,
                // waiting_served_ratio,
                // max_batch_prefill_tokens,
                // max_batch_total_tokens,
                // max_waiting_tokens,
                // max_batch_size,
                revision.clone(),
                validation_workers,
                disable_grammar_support,
                max_client_batch_size,
                usage_stats_level,
                backend.name(),
                origin,
            );
            Some(usage_stats::UserAgent::new(reduced_args))
        }
        _ => None,
    };

    let stop_usage_thread = Arc::new(AtomicBool::new(false));
    let stop_usage_thread_clone = stop_usage_thread.clone();
    if let Some(ua) = user_agent.clone() {
        let start_event =
            usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start, None);
        tokio::spawn(async move {
            // send start event
            start_event.send().await;
            let mut last_report = Instant::now();
            while !stop_usage_thread_clone.load(Ordering::Relaxed) {
                if last_report.elapsed() > Duration::from_secs(900) {
                    let report_event = usage_stats::UsageStatsEvent::new(
                        ua.clone(),
                        usage_stats::EventType::Ping,
                        None,
                    );
                    report_event.send().await;
                    last_report = Instant::now();
                }
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        });
    };
    let compat_return_full_text = match &model_info.pipeline_tag {
        None => {
            tracing::warn!("no pipeline tag found for model {tokenizer_name}");
            true
        }
        Some(pipeline_tag) => pipeline_tag.as_str() == "text-generation",
    };
    let result = start(
        backend,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        validation_workers,
        api_key,
        config,
        (tokenizer?, tokenizer_config),
        (preprocessor_config, processor_config),
        hostname,
        port,
        ngrok,
        _ngrok_authtoken,
        _ngrok_edge,
        disable_grammar_support,
        max_client_batch_size,
        model_info,
        compat_return_full_text,
        allow_origin,
        payload_limit,
        max_image_fetch_size,
        prometheus_port,
    )
    .await;

    if let Some(ua) = user_agent {
        stop_usage_thread.store(true, Ordering::Relaxed);
        match result {
            Ok(_) => {
                let stop_event = usage_stats::UsageStatsEvent::new(
                    ua.clone(),
                    usage_stats::EventType::Stop,
                    None,
                );
                stop_event.send().await;
                Ok(())
            }
            Err(e) => {
                let description = match usage_stats_level {
                    usage_stats::UsageStatsLevel::On => Some(e.to_string()),
                    usage_stats::UsageStatsLevel::NoStack => Some("unknow_error".to_string()),
                    _ => None,
                };
                let event = usage_stats::UsageStatsEvent::new(
                    ua.clone(),
                    usage_stats::EventType::Error,
                    description,
                );
                event.send().await;

                Err(e)
            }
        }
    } else {
        result
    }
}

#[allow(clippy::too_many_arguments)]
async fn start(
    backend: impl Backend + Send + Sync + 'static,
    max_concurrent_requests: usize,
    max_best_of: usize,
    max_stop_sequences: usize,
    max_top_n_tokens: u32,
    max_input_tokens: usize,
    max_total_tokens: usize,
    validation_workers: usize,
    api_key: Option<String>,
    config: Option<Config>,
    (tokenizer, tokenizer_config): (Tokenizer, HubTokenizerConfig),
    (preprocessor_config, processor_config): (Option<HubPreprocessorConfig>, HubProcessorConfig),
    hostname: String,
    port: u16,
    ngrok: bool,
    _ngrok_authtoken: Option<String>,
    _ngrok_edge: Option<String>,
    disable_grammar_support: bool,
    max_client_batch_size: usize,
    model_info: HubModelInfo,
    compat_return_full_text: bool,
    allow_origin: Option<AllowOrigin>,
    payload_limit: usize,
    max_image_fetch_size: usize,
    prometheus_port: u16,
) -> Result<(), WebServerError> {
    // Determine the server port based on the feature and environment variable.
    let port = if cfg!(feature = "google") {
        std::env::var("AIP_HTTP_PORT")
            .map(|aip_http_port| aip_http_port.parse::<u16>().unwrap_or(port))
            .unwrap_or(port)
    } else {
        port
    };

    let addr = match hostname.parse() {
        Ok(ip) => SocketAddr::new(ip, port),
        Err(_) => {
            tracing::warn!("Invalid hostname, defaulting to 0.0.0.0");
            SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), port)
        }
    };

    // Create state
    let validation = Validation::new(
        validation_workers,
        tokenizer,
        config,
        preprocessor_config,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        disable_grammar_support,
        max_image_fetch_size,
    );

    let infer = Infer::new(
        backend,
        validation,
        max_concurrent_requests,
        tokenizer_config,
        processor_config,
    );

    // Duration buckets
    let duration_matcher = Matcher::Suffix(String::from("duration"));
    let n_duration_buckets = 35;
    let mut duration_buckets = Vec::with_capacity(n_duration_buckets);
    // Minimum duration in seconds
    let mut value = 0.0001;
    for _ in 0..n_duration_buckets {
        // geometric sequence
        value *= 1.5;
        duration_buckets.push(value);
    }
    // Input Length buckets
    let input_length_matcher = Matcher::Full(String::from("tgi_request_input_length"));
    let input_length_buckets: Vec<f64> = (0..100)
        .map(|x| (max_input_tokens as f64 / 100.0) * (x + 1) as f64)
        .collect();
    // Generated tokens buckets
    let generated_tokens_matcher = Matcher::Full(String::from("tgi_request_generated_tokens"));
    let generated_tokens_buckets: Vec<f64> = (0..100)
        .map(|x| (max_total_tokens as f64 / 100.0) * (x + 1) as f64)
        .collect();
    // Input Length buckets
    let max_new_tokens_matcher = Matcher::Full(String::from("tgi_request_max_new_tokens"));
    let max_new_tokens_buckets: Vec<f64> = (0..100)
        .map(|x| (max_total_tokens as f64 / 100.0) * (x + 1) as f64)
        .collect();
    // Batch size buckets
    let batch_size_matcher = Matcher::Full(String::from("tgi_batch_next_size"));
    let batch_size_buckets: Vec<f64> = (0..1024).map(|x| (x + 1) as f64).collect();
    // Speculated tokens buckets
    // let skipped_matcher = Matcher::Full(String::from("tgi_request_skipped_tokens"));
    // let skipped_buckets: Vec<f64> = (0..shard_info.speculate + 1).map(|x| x as f64).collect();

    let mut p_addr = addr;
    p_addr.set_port(prometheus_port);

    // Prometheus handler
    let builder = PrometheusBuilder::new()
        .with_http_listener(p_addr)
        .set_buckets_for_metric(duration_matcher, &duration_buckets)
        .unwrap()
        .set_buckets_for_metric(input_length_matcher, &input_length_buckets)
        .unwrap()
        .set_buckets_for_metric(generated_tokens_matcher, &generated_tokens_buckets)
        .unwrap()
        .set_buckets_for_metric(max_new_tokens_matcher, &max_new_tokens_buckets)
        .unwrap()
        .set_buckets_for_metric(batch_size_matcher, &batch_size_buckets)
        .unwrap();
    // .set_buckets_for_metric(skipped_matcher, &skipped_buckets)
    // .unwrap();
    // See: https://github.com/metrics-rs/metrics/issues/467#issuecomment-2022755151
    let (recorder, _) = builder
        .build()
        .expect("failed to build prometheus recorder");
    let prom_handle = recorder.handle();
    metrics::set_global_recorder(recorder).expect("Failed to set global recorder");

    // Metrics descriptions
    metrics::describe_counter!("tgi_request_success", "Number of successful requests");
    metrics::describe_histogram!(
        "tgi_request_duration",
        metrics::Unit::Seconds,
        "Request duration"
    );
    metrics::describe_histogram!(
        "tgi_request_validation_duration",
        metrics::Unit::Seconds,
        "Request validation duration"
    );
    metrics::describe_histogram!(
        "tgi_request_queue_duration",
        metrics::Unit::Seconds,
        "Request queue duration"
    );
    metrics::describe_histogram!(
        "tgi_request_inference_duration",
        metrics::Unit::Seconds,
        "Request inference duration"
    );
    metrics::describe_histogram!(
        "tgi_request_mean_time_per_token_duration",
        metrics::Unit::Seconds,
        "Mean time per token per request"
    );
    metrics::describe_histogram!(
        "tgi_request_generated_tokens",
        metrics::Unit::Count,
        "Generated tokens per request"
    );
    metrics::describe_counter!(
        "tgi_batch_inference_count",
        metrics::Unit::Count,
        "Inference calls per method (prefill or decode)"
    );
    metrics::describe_counter!(
        "tgi_request_count",
        metrics::Unit::Count,
        "Total number of requests"
    );
    metrics::describe_counter!(
        "tgi_batch_inference_success",
        metrics::Unit::Count,
        "Number of successful inference calls per method (prefill or decode)"
    );
    metrics::describe_gauge!(
        "tgi_batch_current_size",
        metrics::Unit::Count,
        "Current batch size"
    );
    metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size");
    metrics::describe_gauge!(
        "tgi_batch_current_max_tokens",
        metrics::Unit::Count,
        "Maximum tokens for the current batch"
    );
    metrics::describe_gauge!(
        "tgi_batch_total_tokens",
        metrics::Unit::Count,
        "Maximum amount of tokens in total."
    );
    metrics::describe_histogram!(
        "tgi_request_max_new_tokens",
        metrics::Unit::Count,
        "Maximum new tokens per request"
    );
    metrics::describe_histogram!(
        "tgi_batch_inference_duration",
        metrics::Unit::Seconds,
        "Batch inference duration"
    );
    metrics::describe_histogram!(
        "tgi_batch_forward_duration",
        metrics::Unit::Seconds,
        "Batch forward duration per method (prefill or decode)"
    );
    metrics::describe_histogram!(
        "tgi_request_skipped_tokens",
        metrics::Unit::Count,
        "Speculated tokens per request"
    );
    metrics::describe_histogram!(
        "tgi_batch_filter_duration",
        metrics::Unit::Seconds,
        "Time spent filtering batches and sending generated tokens per method (prefill or decode)"
    );
    metrics::describe_histogram!(
        "tgi_request_queue_duration",
        metrics::Unit::Seconds,
        "Time spent in the queue per request"
    );
    metrics::describe_histogram!(
        "tgi_request_validation_duration",
        metrics::Unit::Seconds,
        "Time spent validating the request"
    );
    metrics::describe_histogram!(
        "tgi_request_duration",
        metrics::Unit::Seconds,
        "Total time spent processing the request"
    );
    metrics::describe_histogram!(
        "tgi_batch_decode_duration",
        metrics::Unit::Seconds,
        "Time spent decoding a batch per method (prefill or decode)"
    );
    metrics::describe_histogram!(
        "tgi_request_input_length",
        metrics::Unit::Count,
        "Input token length per request"
    );
    metrics::describe_histogram!(
        "tgi_batch_next_size",
        metrics::Unit::Count,
        "Batch size of the next batch"
    );

    // CORS layer
    let allow_origin = allow_origin.unwrap_or(AllowOrigin::any());
    let cors_layer = CorsLayer::new()
        .allow_methods([Method::GET, Method::POST])
        .allow_headers([http::header::CONTENT_TYPE])
        .allow_origin(allow_origin);

    // Endpoint info
    let info = Info {
        model_id: model_info.model_id,
        model_sha: model_info.sha,
        // model_dtype: shard_info.dtype,
        // model_device_type: shard_info.device_type,
        model_pipeline_tag: model_info.pipeline_tag,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_input_tokens,
        max_total_tokens,
        // waiting_served_ratio,
        // max_batch_total_tokens,
        // max_waiting_tokens,
        // max_batch_size,
        validation_workers,
        max_client_batch_size,
        router: env!("CARGO_PKG_NAME"),
        version: env!("CARGO_PKG_VERSION"),
        sha: option_env!("VERGEN_GIT_SHA"),
        docker_label: option_env!("DOCKER_LABEL"),
    };

    #[allow(unused_mut)] // mut is needed for conditional compilation
    let mut doc = ApiDoc::openapi();

    #[cfg(feature = "google")]
    {
        use crate::vertex::__path_vertex_compatibility;
        use crate::vertex::{VertexInstance, VertexRequest, VertexResponse};

        #[derive(OpenApi)]
        #[openapi(
            paths(vertex_compatibility),
            components(schemas(VertexInstance, VertexRequest, VertexResponse))
        )]
        struct VertexApiDoc;

        doc.merge(VertexApiDoc::openapi());
    }

    #[cfg(feature = "kserve")]
    {
        use crate::kserve::{
            InferenceOutput, InferenceRequest, LiveResponse, MetadataServerResponse, OutputChunk,
            ReadyResponse,
        };
        use crate::kserve::{
            __path_kerve_server_metadata, __path_kserve_health_live, __path_kserve_health_ready,
            __path_kserve_model_infer, __path_kserve_model_metadata,
            __path_kserve_model_metadata_ready,
        };

        #[derive(OpenApi)]
        #[openapi(
            paths(
                kserve_health_live,
                kserve_health_ready,
                kerve_server_metadata,
                kserve_model_metadata,
                kserve_model_metadata_ready,
                kserve_model_infer,
            ),
            components(schemas(
                InferenceOutput,
                InferenceRequest,
                LiveResponse,
                MetadataServerResponse,
                OutputChunk,
                ReadyResponse,
            ))
        )]
        struct KServeApiDoc;

        doc.merge(KServeApiDoc::openapi());
    }

    // Configure Swagger UI
    let swagger_ui = SwaggerUi::new("/docs").url("/api-doc/openapi.json", doc);

    // Define base and health routes
    let mut base_routes = Router::new()
        .route("/", post(compat_generate))
        .route("/generate", post(generate))
        .route("/generate_stream", post(generate_stream))
        .route("/v1/chat/completions", post(chat_completions))
        .route("/v1/completions", post(completions))
        .route("/vertex", post(vertex_compatibility))
        .route("/invocations", post(sagemaker_compatibility))
        .route("/tokenize", post(tokenize));

    if let Some(api_key) = api_key {
        let mut prefix = "Bearer ".to_string();
        prefix.push_str(&api_key);

        // Leak to allow FnMut
        let api_key: &'static str = prefix.leak();

        let auth = move |headers: HeaderMap,
                         request: axum::extract::Request,
                         next: axum::middleware::Next| async move {
            match headers.get(AUTHORIZATION) {
                Some(token) => match token.to_str() {
                    Ok(token_str) if token_str.to_lowercase() == api_key.to_lowercase() => {
                        let response = next.run(request).await;
                        Ok(response)
                    }
                    _ => Err(StatusCode::UNAUTHORIZED),
                },
                None => Err(StatusCode::UNAUTHORIZED),
            }
        };

        base_routes = base_routes.layer(axum::middleware::from_fn(auth))
    }
    let info_routes = Router::new()
        .route("/", get(health))
        .route("/chat_tokenize", post(get_chat_tokenize))
        .route("/info", get(get_model_info))
        .route("/health", get(health))
        .route("/ping", get(health))
        .route("/metrics", get(metrics))
        .route("/v1/models", get(openai_get_model_info));

    let compute_type =
        ComputeType(std::env::var("COMPUTE_TYPE").unwrap_or("gpu+optimized".to_string()));

    // Combine routes and layers
    let mut app = Router::new()
        .merge(swagger_ui)
        .merge(base_routes)
        .merge(info_routes);

    #[cfg(feature = "google")]
    {
        tracing::info!("Built with `google` feature");
        tracing::info!(
            "Environment variables `AIP_PREDICT_ROUTE` and `AIP_HEALTH_ROUTE` will be respected."
        );
        if let Ok(env_predict_route) = std::env::var("AIP_PREDICT_ROUTE") {
            app = app.route(&env_predict_route, post(vertex_compatibility));
        }
        if let Ok(env_health_route) = std::env::var("AIP_HEALTH_ROUTE") {
            app = app.route(&env_health_route, get(health));
        }
    }

    #[cfg(feature = "kserve")]
    {
        tracing::info!("Built with `kserve` feature");
        app = app
            .route(
                "/v2/models/:model_name/versions/:model_version/infer",
                post(kserve_model_infer),
            )
            .route(
                "/v2/models/:model_name/versions/:model_version",
                get(kserve_model_metadata),
            )
            .route("/v2/health/ready", get(kserve_health_ready))
            .route("/v2/health/live", get(kserve_health_live))
            .route("/v2", get(kerve_server_metadata))
            .route(
                "/v2/models/:model_name/versions/:model_version/ready",
                get(kserve_model_metadata_ready),
            );
    }

    // add layers after routes
    app = app
        .layer(Extension(info))
        .layer(Extension(compat_return_full_text))
        .layer(Extension(infer))
        .layer(Extension(compute_type))
        .layer(Extension(prom_handle.clone()))
        .layer(OtelAxumLayer::default())
        .layer(DefaultBodyLimit::max(payload_limit))
        .layer(axum::middleware::from_fn(trace_context_middleware))
        .layer(cors_layer);

    tracing::info!("Connected");

    if ngrok {
        #[cfg(feature = "ngrok")]
        {
            panic!("ngrok feature is not functional with axum=0.7 and hyper=1, waiting on https://github.com/ngrok/ngrok-rust/pull/137/files to re-enable.");

            // Run server
        }
        #[cfg(not(feature = "ngrok"))]
        {
            let _ngrok_authtoken = ngrok_authtoken;
            let _ngrok_domain = ngrok_domain;
            let _ngrok_username = ngrok_username;
            let _ngrok_password = ngrok_password;

            panic!("`text-generation-router` was compiled without the `ngrok` feature");
        }
    } else {
        // Run server
        let listener = match tokio::net::TcpListener::bind(&addr).await {
            Ok(listener) => listener,
            Err(e) => {
                tracing::error!("Failed to bind to {addr}: {e}");
                return Err(WebServerError::Axum(Box::new(e)));
            }
        };
        axum::serve(listener, app)
            .with_graceful_shutdown(shutdown_signal())
            .await
            .map_err(|err| WebServerError::Axum(Box::new(err)))?;
    }
    Ok(())
}

/// get model info from the Huggingface Hub
pub async fn get_hub_model_info(api: &ApiRepo) -> Option<HubModelInfo> {
    let response = api.info_request().send().await.ok()?;

    if response.status().is_success() {
        let hub_model_info: HubModelInfo =
            serde_json::from_str(&response.text().await.ok()?).ok()?;
        if let Some(sha) = &hub_model_info.sha {
            tracing::info!(
                "Serving revision {sha} of model {}",
                hub_model_info.model_id
            );
        }
        Some(hub_model_info)
    } else {
        None
    }
}

/// get tokenizer_config from the Huggingface Hub
pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option<HubTokenizerConfig> {
    let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok()?;

    // Open the file in read-only mode with buffer.
    let file = File::open(tokenizer_config_filename).ok()?;
    let reader = BufReader::new(file);

    // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
    let tokenizer_config: HubTokenizerConfig = serde_json::from_reader(reader)
        .map_err(|e| {
            tracing::warn!("Unable to parse tokenizer config: {}", e);
            e
        })
        .ok()?;

    Some(tokenizer_config)
}

/// Shutdown signal handler
async fn shutdown_signal() {
    let ctrl_c = async {
        signal::ctrl_c()
            .await
            .expect("failed to install Ctrl+C handler");
    };

    #[cfg(unix)]
    let terminate = async {
        signal::unix::signal(signal::unix::SignalKind::terminate())
            .expect("failed to install signal handler")
            .recv()
            .await;
    };

    #[cfg(not(unix))]
    let terminate = std::future::pending::<()>();

    tokio::select! {
        _ = ctrl_c => {},
        _ = terminate => {},
    }

    tracing::info!("signal received, starting graceful shutdown");
    opentelemetry::global::shutdown_tracer_provider();
}

/// Convert to Axum supported formats
impl From<InferError> for (StatusCode, Json<ErrorResponse>) {
    fn from(err: InferError) -> Self {
        let status_code = match err {
            InferError::GenerationError(_) => StatusCode::FAILED_DEPENDENCY,
            InferError::Overloaded(_) => StatusCode::TOO_MANY_REQUESTS,
            InferError::ValidationError(_) => StatusCode::UNPROCESSABLE_ENTITY,
            InferError::IncompleteGeneration => StatusCode::INTERNAL_SERVER_ERROR,
            InferError::IncompleteGenerationStream => StatusCode::INTERNAL_SERVER_ERROR,
            InferError::TemplateError(_) => StatusCode::UNPROCESSABLE_ENTITY,
            InferError::MissingTemplateVariable(_) => StatusCode::UNPROCESSABLE_ENTITY,
            InferError::ToolError(_) => StatusCode::UNPROCESSABLE_ENTITY,
            InferError::StreamSerializationError(_) => StatusCode::INTERNAL_SERVER_ERROR,
        };

        (
            status_code,
            Json(ErrorResponse {
                error: err.to_string(),
                error_type: err.error_type().to_string(),
            }),
        )
    }
}

impl From<InferError> for Event {
    fn from(err: InferError) -> Self {
        Event::default()
            .json_data(ErrorResponse {
                error: err.to_string(),
                error_type: err.error_type().to_string(),
            })
            .unwrap()
    }
}

#[derive(Debug, Error)]
pub enum WebServerError {
    #[error("Axum error: {0}")]
    Axum(#[from] axum::BoxError),
    #[error("Tokenizer error: {0}")]
    Tokenizer(String),
}


================================================
FILE: router/src/usage_stats.rs
================================================
use crate::config::Config;
use clap::ValueEnum;
use csv::ReaderBuilder;
use reqwest::header::HeaderMap;
use serde::Serialize;
use std::{
    fs::File,
    io::{self, BufRead},
    path::Path,
    process::Command,
    time::Duration,
};
use uuid::Uuid;

const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi";

#[derive(Copy, Clone, Debug, Serialize, ValueEnum)]
pub enum UsageStatsLevel {
    On,
    NoStack,
    Off,
}

#[derive(Debug, Clone, Serialize)]
pub struct UserAgent {
    pub uid: String,
    pub args: Args,
    pub env: Env,
}

impl UserAgent {
    pub fn new(reduced_args: Args) -> Self {
        Self {
            uid: Uuid::new_v4().to_string(),
            args: reduced_args,
            env: Env::new(),
        }
    }
}

#[derive(Serialize, Debug)]
pub enum EventType {
    Start,
    Stop,
    Error,
    Ping,
}

#[derive(Debug, Serialize)]
pub struct UsageStatsEvent {
    user_agent: UserAgent,
    event_type: EventType,
    #[serde(skip_serializing_if = "Option::is_none")]
    error_reason: Option<String>,
}

impl UsageStatsEvent {
    pub fn new(user_agent: UserAgent, event_type: EventType, error_reason: Option<String>) -> Self {
        Self {
            user_agent,
            event_type,
            error_reason,
        }
    }
    pub async fn send(&self) {
        let mut headers = HeaderMap::new();
        headers.insert("Content-Type", "application/json".parse().unwrap());
        let body = serde_json::to_string(&self).unwrap();
        let client = reqwest::Client::new();
        let _ = client
            .post(TELEMETRY_URL)
            .headers(headers)
            .body(body)
            .timeout(Duration::from_secs(10))
            .send()
            .await;
    }
}

#[derive(Debug, Clone, Serialize)]
pub struct Args {
    model_config: Option<Config>,
    tokenizer_class: Option<String>,
    max_concurrent_requests: usize,
    max_best_of: usize,
    max_stop_sequences: usize,
    max_top_n_tokens: u32,
    max_input_tokens: usize,
    max_total_tokens: usize,
    // waiting_served_ratio: f32,
    // max_batch_prefill_tokens: u32,
    // max_batch_total_tokens: Option<u32>,
    // max_waiting_tokens: usize,
    // max_batch_size: Option<usize>,
    revision: Option<String>,
    validation_workers: usize,
    disable_grammar_support: bool,
    max_client_batch_size: usize,
    usage_stats_level: UsageStatsLevel,
    backend_name: &'static str,
    origin: Option<String>,
}

impl Args {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        model_config: Option<Config>,
        tokenizer_class: Option<String>,
        max_concurrent_requests: usize,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_top_n_tokens: u32,
        max_input_tokens: usize,
        max_total_tokens: usize,
        // waiting_served_ratio: f32,
        // max_batch_prefill_tokens: u32,
        // max_batch_total_tokens: Option<u32>,
        // max_waiting_tokens: usize,
        // max_batch_size: Option<usize>,
        revision: Option<String>,
        validation_workers: usize,
        disable_grammar_support: bool,
        max_client_batch_size: usize,
        usage_stats_level: UsageStatsLevel,
        backend_name: &'static str,
        origin: Option<String>,
    ) -> Self {
        Self {
            model_config,
            tokenizer_class,
            max_concurrent_requests,
            max_best_of,
            max_stop_sequences,
            max_top_n_tokens,
            max_input_tokens,
            max_total_tokens,
            // waiting_served_ratio,
            // max_batch_prefill_tokens,
            // max_batch_total_tokens,
            // max_waiting_tokens,
            // max_batch_size,
            revision,
            validation_workers,
            disable_grammar_support,
            max_client_batch_size,
            usage_stats_level,
            backend_name,
            origin,
        }
    }
}

/// This is more or less a copy of the code from the `text-generation-launcher` crate to avoid a dependency
#[derive(Serialize, Debug, Clone)]
pub struct Env {
    git_sha: &'static str,
    docker_label: &'static str,
    nvidia_info: Option<Vec<NvidiaSmiInfo>>,
    xpu_info: Option<Vec<XpuSmiInfo>>,
    hpu_info: Option<Vec<HpuSmiInfo>>,
    system_env: SystemInfo,
}

#[derive(Debug, Serialize, Clone)]
struct NvidiaSmiInfo {
    name: String,
    pci_bus_id: String,
    driver_version: String,
    pstate: String,
    pcie_link_gen_max: String,
    pcie_link_gen_current: String,
    temperature_gpu: String,
    utilization_gpu: String,
    utilization_memory: String,
    memory_total: String,
    memory_free: String,
    memory_used: String,
    reset_status_reset_required: String,
    reset_status_drain_and_reset_recommended: String,
    compute_cap: String,
    ecc_errors_corrected_volatile_total: String,
    mig_mode_current: String,
    power_draw_instant: String,
    power_limit: String,
}

impl NvidiaSmiInfo {
    fn new() -> Option<Vec<NvidiaSmiInfo>> {
        let output = Command::new("nvidia-smi")
            .args([
                "--query-gpu=name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.gpucurrent,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,reset_status.reset_required,reset_status.drain_and_reset_recommended,compute_cap,ecc.errors.corrected.volatile.total,mig.mode.current,power.draw.instant,power.limit",
                "--format=csv"
            ])
            .output()
            .ok()?;

        if !output.status.success() {
            return None;
        }

        let stdout = String::from_utf8(output.stdout).ok()?;

        let mut rdr = ReaderBuilder::new()
            .has_headers(true)
            .from_reader(stdout.as_bytes());

        let mut infos = Vec::new();

        for result in rdr.records() {
            let record = result.ok()?;
            infos.push(NvidiaSmiInfo {
                name: record[0].to_string(),
                pci_bus_id: record[1].to_string(),
                driver_version: record[2].to_string(),
                pstate: record[3].to_string(),
                pcie_link_gen_max: record[4].to_string(),
                pcie_link_gen_current: record[5].to_string(),
                temperature_gpu: record[6].to_string(),
                utilization_gpu: record[7].to_string(),
                utilization_memory: record[8].to_string(),
                memory_total: record[9].to_string(),
                memory_free: record[10].to_string(),
                memory_used: record[11].to_string(),
                reset_status_reset_required: record[12].to_string(),
                reset_status_drain_and_reset_recommended: record[13].to_string(),
                compute_cap: record[14].to_string(),
                ecc_errors_corrected_volatile_total: record[15].to_string(),
                mig_mode_current: record[16].to_string(),
                power_draw_instant: record[17].to_string(),
                power_limit: record[18].to_string(),
            });
        }

        Some(infos)
    }
}

#[derive(Debug, Serialize, Clone)]
struct XpuSmiInfo {
    device_id: usize,
    gpu_utilization: f32,
    gpu_power: f32,
    gpu_core_temperature: f32,
    gpu_memory_bandwidth_utilization: f32,
}

impl XpuSmiInfo {
    /// based on this https://github.com/intel/xpumanager/blob/master/doc/smi_user_guide.md#dump-the-device-statistics-in-csv-format
    fn new() -> Option<Vec<XpuSmiInfo>> {
        let output = Command::new("xpu-smi")
            .args([
                "dump", "-d", "-1", "-m",
                "0,1,3,17", // Metrics IDs: GPU Utilization, GPU Power, GPU Core Temperature, GPU Memory Bandwidth Utilization
                "-n", "1", "-j",
            ])
            .output()
            .ok()?;

        if !output.status.success() {
            return None;
        }

        let stdout = String::from_utf8(output.stdout).ok()?;
        let mut infos = Vec::new();

        let json_data: serde_json::Value = match serde_json::from_str(&stdout) {
            Ok(data) => data,
            Err(_) => return None,
        };

        if let Some(metrics_data) = json_data.as_array() {
            for entry in metrics_data {
                let device_id = entry["deviceId"].as_u64()? as usize;
                let gpu_utilization = entry["metrics"][0].as_f64()? as f32;
                let gpu_power = entry["metrics"][1].as_f64()? as f32;
                let gpu_core_temperature = entry["metrics"][2].as_f64()? as f32;
                let gpu_memory_bandwidth_utilization = entry["metrics"][3].as_f64()? as f32;

                infos.push(XpuSmiInfo {
                    device_id,
                    gpu_utilization,
                    gpu_power,
                    gpu_core_temperature,
                    gpu_memory_bandwidth_utilization,
                });
            }
        }

        Some(infos)
    }
}

#[derive(Debug, Serialize, Clone)]
struct HpuSmiInfo {
    name: String,
    pci_bus_id: String,
    driver_version: String,
    temperature: String,
    utilization: String,
    memory_total: String,
    memory_free: String,
    memory_used: String,
    power_draw_instant: String,
}

impl HpuSmiInfo {
    fn new() -> Option<Vec<HpuSmiInfo>> {
        let output = Command::new("hl-smi")
            .args([
                "--query-aip=name,bus_id,driver_version,temperature.aip,utilization.aip,memory.total,memory.free,memory.used,power.draw",
                "--format=csv"
            ])
            .output()
            .ok()?;

        if !output.status.success() {
            return None;
        }

        let stdout = String::from_utf8(output.stdout).ok()?;

        let mut rdr = ReaderBuilder::new()
            .has_headers(true)
            .from_reader(stdout.as_bytes());

        let mut infos = Vec::new();

        for result in rdr.records() {
            let record = result.ok()?;
            infos.push(HpuSmiInfo {
                name: record[0].to_string(),
                pci_bus_id: record[1].to_string(),
                driver_version: record[2].to_string(),
                temperature: record[3].to_string(),
                utilization: record[4].to_string(),
                memory_total: record[5].to_string(),
                memory_free: record[6].to_string(),
                memory_used: record[7].to_string(),
                power_draw_instant: record[8].to_string(),
            });
        }

        Some(infos)
    }
}

#[derive(Serialize, Debug, Clone)]
pub struct SystemInfo {
    cpu_count: usize,
    cpu_type: String,
    total_memory: u64,
    architecture: String,
    platform: String,
}

impl SystemInfo {
    fn new() -> Self {
        let mut system = sysinfo::System::new_all();
        system.refresh_all();

        let cpu_count = system.cpus().len();
        let cpu_type = system.cpus()[0].brand().to_string();
        let total_memory = system.total_memory();
        let architecture = std::env::consts::ARCH.to_string();
        let platform = format!(
            "{}-{}-{}",
            std::env::consts::OS,
            std::env::consts::FAMILY,
            std::env::consts::ARCH
        );
        Self {
            cpu_count,
            cpu_type,
            total_memory,
            architecture,
            platform,
        }
    }
}

impl Default for Env {
    fn default() -> Self {
        Self::new()
    }
}

impl Env {
    pub fn new() -> Self {
        Self {
            system_env: SystemInfo::new(),
            nvidia_info: NvidiaSmiInfo::new(),
            xpu_info: XpuSmiInfo::new(),
            hpu_info: HpuSmiInfo::new(),
            git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
            docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"),
        }
    }
    pub fn is_hpu_device(&self) -> bool {
        self.hpu_info.is_some()
    }
}

pub fn is_container() -> io::Result<bool> {
    let path = Path::new("/proc/self/cgroup");
    let file = File::open(path)?;
    let reader = io::BufReader::new(file);

    for line in reader.lines() {
        let line = line?;
        // Check for common container runtimes
        if line.contains("/docker/")
            || line.contains("/docker-")
            || line.contains("/kubepods/")
            || line.contains("/kubepods-")
            || line.contains("containerd")
            || line.contains("crio")
            || line.contains("podman")
        {
            return Ok(true);
        }
    }
    Ok(false)
}


================================================
FILE: router/src/validation.rs
================================================
use crate::config::Config;
use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput};
use crate::{
    GenerateParameters, GenerateRequest, GrammarType, HubPreprocessorConfig, Idefics2Preprocessor,
    TokenizerTrait,
};
use crate::{PyTokenizer, Tokenizer};
use base64::{engine::general_purpose::STANDARD, Engine};
use image::{ImageFormat, ImageReader};
use outlines_core::json_schema::to_regex as json_schema_to_regex;
use rand::{thread_rng, Rng};
use serde_json::Value;
/// Payload validation logic
use std::cmp::min;
use std::io::{Cursor, Read};
use std::iter;
use std::sync::Arc;
use thiserror::Error;
use tokio::sync::mpsc;
use tokio::sync::oneshot;
use tracing::warn;
use tracing::{instrument, Span};
use {once_cell::sync::Lazy, regex::Regex};

static DEFAULT_GENERATION_LENGTH: u32 = 1024;

/// Validation
#[derive(Debug, Clone)]
pub struct Validation {
    /// Validation parameters
    max_best_of: usize,
    max_stop_sequences: usize,
    max_top_n_tokens: u32,
    max_input_length: usize,
    max_total_tokens: usize,
    disable_grammar_support: bool,
    /// Channel to communicate with the background tokenization task
    sender: mpsc::UnboundedSender<TokenizerRequest>,
}

impl Validation {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        workers: usize,
        tokenizer: Tokenizer,
        config: Option<Config>,
        preprocessor_config: Option<HubPreprocessorConfig>,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_top_n_tokens: u32,
        max_input_length: usize,
        max_total_tokens: usize,
        disable_grammar_support: bool,
        max_image_fetch_size: usize,
    ) -> Self {
        let workers = if let Tokenizer::Python { .. } = &tokenizer {
            1
        } else {
            workers
        };
        // If we have a fast tokenizer
        let sender = {
            // Create round robin channel
            let (validation_sender, validation_round_robin_receiver) = mpsc::unbounded_channel();
            let mut senders = Vec::with_capacity(workers);

            // Create workers
            for _ in 0..workers {
                let tokenizer_clone = tokenizer.clone();
                let config_clone = config.clone();
                let preprocessor_config_clone = preprocessor_config.clone();
                let (tokenizer_sender, tokenizer_receiver) = mpsc::unbounded_channel();
                senders.push(tokenizer_sender);

                // Spawn worker
                tokio::task::spawn_blocking(move || {
                    tokenizer_worker(
                        tokenizer_clone,
                        config_clone,
                        preprocessor_config_clone,
                        tokenizer_receiver,
                        max_image_fetch_size,
                    )
                });
            }

            // Create tokenization round robin task
            tokio::spawn(round_robin_task(validation_round_robin_receiver, senders));

            validation_sender
        };

        Self {
            max_best_of,
            sender,
            max_stop_sequences,
            max_top_n_tokens,
            max_input_length,
            max_total_tokens,
            disable_grammar_support,
        }
    }

    #[instrument(skip(self, inputs))]
    pub async fn tokenize(
        &self,
        inputs: String,
        add_special_tokens: bool,
        truncate: Option<usize>,
    ) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
        // If we have a fast tokenizer
        // Create response channel
        let (response_sender, response_receiver) = oneshot::channel();
        // Send request to the background validation task
        // Unwrap is safe here
        let _ = &self
            .sender
            .send((
                (inputs, add_special_tokens, truncate),
                response_sender,
                Span::current(),
            ))
            .unwrap();

        // Await on response channel
        // Unwrap is safe here
        let encoding = response_receiver.await.unwrap()?;
        Ok(encoding)
    }

    #[allow(clippy::type_complexity)]
    #[instrument(skip(self, inputs))]
    async fn validate_input(
        &self,
        inputs: String,
        add_special_tokens: bool,
        truncate: Option<usize>,
        max_new_tokens: Option<u32>,
    ) -> Result<(Vec<Chunk>, Option<Vec<u32>>, usize, u32, u32), ValidationError> {
        // If we have a fast tokenizer
        let (encoding, inputs) = self
            .tokenize(inputs.clone(), add_special_tokens, truncate)
            .await?;
        // Create response channel
        let input_length = if let Some(truncate) = truncate {
            std::cmp::min(encoding.len(), truncate)
        } else {
            encoding.len()
        };

        // Get total tokens
        let (max_new_tokens, max_total_new_tokens) = if let Some(max_new_tokens) = max_new_tokens {
            // Do not accept humongous max_new_tokens queries.
            // We preallocate the default but we prevent a single user
            // from taking up all the slots in a handful of queries that consume little
            // amount of tokens. (You can have 10 token long query that creates a handful of token
            // but the requested amount to be 120k.
            let chunk_size = min(max_new_tokens, DEFAULT_GENERATION_LENGTH);
            (chunk_size, max_new_tokens)
        } else {
            // Use the maximum possible number of tokens as default
            // However, the system will re-queue the request everytime it completes
            // `DEFAULT_GENERATION_LENGTH` tokens.
            let max_new_tokens = self.max_total_tokens.saturating_sub(input_length) as u32;
            (
                min(max_new_tokens, DEFAULT_GENERATION_LENGTH),
                max_new_tokens,
            )
        };
        let total_tokens = input_length + max_new_tokens as usize;

        // Validate MaxTotalTokens
        if total_tokens > self.max_total_tokens {
            return Err(ValidationError::MaxTotalTokens(
                self.max_total_tokens,
                input_length,
                max_new_tokens,
            ));
        }

        // Validate InputLength
        if input_length > self.max_input_length {
            return Err(ValidationError::InputLength(
                self.max_input_length,
                input_length,
            ));
        }

        let ids = encoding.get_ids();
        let input_ids = ids[ids.len().saturating_sub(input_length)..].to_owned();

        metrics::histogram!("tgi_request_input_length").record(input_length as f64);
        Ok((
            inputs,
            Some(input_ids),
            input_length,
            max_new_tokens,
            max_total_new_tokens,
        ))
    }

    /// Validate a payload and get the number of tokens in the input
    #[instrument(skip_all)]
    pub(crate) async fn validate(
        &self,
        request: GenerateRequest,
    ) -> Result<ValidGenerateRequest, ValidationError> {
        let GenerateParameters {
            best_of,
            temperature,
            repetition_penalty,
            frequency_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            max_new_tokens,
            stop: stop_sequences,
            truncate,
            seed,
            watermark,
            decoder_input_details,
            top_n_tokens,
            grammar,
            adapter_id,
            ..
        } = request.parameters;

        // sampling must be true when best_of > 1
        let best_of = best_of.unwrap_or(1);
        let sampling = do_sample
            || temperature.is_some()
            || top_k.is_some()
            || top_p.is_some()
            || typical_p.is_some();

        if best_of > 1 && !sampling {
            return Err(BestOfSampling);
        }

        let temperature = temperature.unwrap_or(1.0);
        if temperature <= 0.0 {
            return Err(ValidationError::Temperature);
        }

        let repetition_penalty = repetition_penalty.unwrap_or(1.0);
        if repetition_penalty <= 0.0 {
            return Err(ValidationError::RepetitionPenalty);
        }

        let frequency_penalty = frequency_penalty.unwrap_or(0.0);
        if !(-2.0..=2.0).contains(&frequency_penalty) {
            return Err(ValidationError::FrequencyPenalty);
        }

        // Different because the proto default value is not a valid value
        // for the user
        let top_p = top_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TopP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let typical_p = typical_p
            .map(|value| {
                if value <= 0.0 || value >= 1.0 {
                    return Err(ValidationError::TypicalP);
                }
                Ok(value)
            })
            .unwrap_or(Ok(1.0))?;

        let top_k: u32 = top_k
            .map(|value| {
                if value <= 0 {
                    return Err(ValidationError::TopK);
                }
                Ok(value as u32)
            })
            .unwrap_or(Ok(0))?;

        if max_new_tokens == Some(0) {
            return Err(ValidationError::NegativeMaxNewTokens);
        }

        if stop_sequences.len() > self.max_stop_sequences {
            return Err(ValidationError::StopSequence(
                self.max_stop_sequences,
                stop_sequences.len(),
            ));
        }

        // If seed is None, assign a random one
        let seed = match seed {
            None => thread_rng().gen(),
            Some(seed) => {
                if best_of > 1 {
                    return Err(BestOfSeed);
                }
                seed
            }
        };

        let top_n_tokens = top_n_tokens
            .map(|value| {
                if value > self.max_top_n_tokens {
                    return Err(ValidationError::TopNTokens(self.max_top_n_tokens, value));
                }
                Ok(value)
            })
            .unwrap_or(Ok(0))?;

        // Check if inputs is empty
        if request.inputs.is_empty() {
            return Err(EmptyInput);
        }

        // Check if truncate is strictly positive and less than max_input_length
        let truncate = truncate
            .map(|value| {
                if value == 0 || value > self.max_input_length {
                    return Err(ValidationError::Truncate(self.max_input_length, value));
                }
                Ok(Some(value))
            })
            .unwrap_or(Ok(None))?;

        // Validate inputs
        let (inputs, input_ids, input_length, max_new_tokens, max_total_new_tokens) = self
            .validate_input(
                request.inputs,
                request.add_special_tokens,
                truncate,
                max_new_tokens,
            )
            .await?;

        // TODO: we should build the FSM here and pass the compiled FSM instead of the grammar
        // NOTE: this is currently difficult because we need the tokenizer in Python to build
        // the FSM and we'd have to load a copy of the tokenizer into our Pyo3 instance which
        // may be slow and memory intensive. Best case is to have a Rust implementation of the FSM
        // compiler and use that to build the FSM here.

        // Validate grammar and unpack the grammar and type for the proto message
        let grammar = match grammar {
            Some(grammar) => {
                // Ensure that grammar is not set if it's not supported
                if self.disable_grammar_support {
                    return Err(ValidationError::Grammar);
                }
                let valid_grammar = match grammar {
                    GrammarType::Json(json) => {
                        let json = match json {
                            // if value is a string, we need to parse it again to make sure its
                            // a valid json
                            Value::String(s) => serde_json::from_str(&s)
                                .map_err(|e| ValidationError::InvalidGrammar(e.to_string())),
                            Value::Object(_) => Ok(json),
                            _ => Err(ValidationError::Grammar),
                        }?;

                        // Check if the json is a valid JSONSchema
                        jsonschema::draft202012::meta::validate(&json)
                            .map_err(|e| ValidationError::InvalidGrammar(e.to_string()))?;

                        // The schema can be valid but lack properties.
                        // We need properties for the grammar to be successfully parsed in Python.
                        // Therefore, we must check and throw an error if properties are missing.
                        json.get("properties")
                            .ok_or(ValidationError::InvalidGrammar(
                                "Grammar must have a 'properties' field".to_string(),
                            ))?;

                        // Do compilation in the router for performance. In the future, we
                        // should also move regex -> automaton compilation in the router,
                        // but this is not yet supported in pure Rust by outlines-core.
                        let grammar_regex = json_schema_to_regex(&json, None, &json)
                            .map_err(ValidationError::RegexFromSchema)?;

                        ValidGrammar::Regex(grammar_regex.to_string())
                    }
                    GrammarType::JsonSchema(schema_config) => {
                        // Extract the actual schema for validation
                        let json = &schema_config.schema;

                        // Check if the json is a valid JSONSchema
                        jsonschema::draft202012::meta::validate(json)
                            .map_err(|e| ValidationError::InvalidGrammar(e.to_string()))?;

                        // The schema can be valid but lack properties.
                        // We need properties for the grammar to be successfully parsed in Python.
                        // Therefore, we must check and throw an error if properties are missing.
                        json.get("properties")
                            .ok_or(ValidationError::InvalidGrammar(
                                "Grammar must have a 'properties' field".to_string(),
                            ))?;

                        // Do compilation in the router for performance
                        let grammar_regex = json_schema_to_regex(json, None, json)
                            .map_err(ValidationError::RegexFromSchema)?;

                        ValidGrammar::Regex(grammar_regex.to_string())
                    }
                    GrammarType::Regex(regex) => ValidGrammar::Regex(regex),
                };
                Some(valid_grammar)
            }
            None => None,
        };

        let parameters = ValidParameters {
            temperature,
            repetition_penalty,
            frequency_penalty,
            top_k,
            top_p,
            typical_p,
            do_sample,
            seed,
            watermark,
            grammar,
        };
        let stopping_parameters = ValidStoppingParameters {
            max_new_tokens,
            max_total_new_tokens,
            stop_sequences,
            ignore_eos_token: false,
        };

        metrics::histogram!("tgi_request_max_new_tokens").record(max_new_tokens as f64);

        Ok(ValidGenerateRequest {
            inputs,
            input_ids: input_ids.map(Arc::new),
            add_special_tokens: request.add_special_tokens,
            decoder_input_details,
            input_length: input_length as u32,
            truncate: truncate.unwrap_or(self.max_input_length) as u32,
            parameters,
            stopping_parameters,
            top_n_tokens,
            adapter_id,
        })
    }

    /// Validate the best_of parameter
    #[instrument(skip_all)]
    pub(crate) fn validate_best_of(&self, best_of: usize) -> Result<usize, ValidationError> {
        if self.max_best_of == 1 && best_of != 1 {
            return Err(ValidationError::BestOfDisabled);
        }

        if best_of > self.max_best_of {
            return Err(ValidationError::BestOf(self.max_best_of, best_of));
        }

        Ok(best_of)
    }
}

/// Round robin tokenization task
async fn round_robin_task(
    mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
    senders: Vec<mpsc::UnboundedSender<TokenizerRequest>>,
) {
    loop {
        for sender in &senders {
            match receiver.recv().await {
                None => return,
                Some(request) => sender.send(request).unwrap(),
            };
        }
    }
}

/// Start tokenization workers
fn tokenizer_worker(
    tokenizer: Tokenizer,
    config: Option<Config>,
    preprocessor_config: Option<HubPreprocessorConfig>,
    mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
    max_image_fetch_size: usize,
) {
    match tokenizer {
        Tokenizer::Python {
            tokenizer_name,
            revision,
            trust_remote_code,
        } => {
            pyo3::Python::with_gil(|py| -> pyo3::PyResult<()> {
                let tokenizer =
                    PyTokenizer::from_py(py, tokenizer_name, revision, trust_remote_code)?;
                // Loop over requests
                while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
                    receiver.blocking_recv()
                {
                    parent_span.in_scope(|| {
                        response_tx
                            .send(prepare_input(
                                inputs,
                                truncate,
                                add_special_tokens,
                                &tokenizer,
                                config.as_ref(),
                                preprocessor_config.as_ref(),
                                max_image_fetch_size,
                            ))
                            .unwrap_or(())
                    })
                }
                Ok(())
            })
            .expect("Failure in python tokenizer worker");
        }
        Tokenizer::Rust(tokenizer) => {
            while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
                receiver.blocking_recv()
            {
                parent_span.in_scope(|| {
                    response_tx
                        .send(prepare_input(
                            inputs,
                            truncate,
                            add_special_tokens,
                            &tokenizer,
                            config.as_ref(),
                            preprocessor_config.as_ref(),
                            max_image_fetch_size,
                        ))
                        .unwrap_or(())
                })
            }
        }
    }
}

fn format_from_mimetype(mimetype: &str) -> Option<ImageFormat> {
    match mimetype {
        "image/png" => Some(ImageFormat::Png),
        "image/jpeg" => Some(ImageFormat::Jpeg),
        "image/jpg" => Some(ImageFormat::Jpeg),
        "image/gif" => Some(ImageFormat::Gif),
        "image/webp" => Some(ImageFormat::WebP),
        "image/tiff" => Some(ImageFormat::Tiff),
        // "image/pnm"=>Some(ImageFormat::Pnm),
        // "image/tga"=>Some(ImageFormat::Tga),
        // "image/dds"=>Some(ImageFormat::Dds),
        // "image/bmp"=>Some(ImageFormat::Bmp),
        // "image/ico"=>Some(ImageFormat::Ico),
        // "image/x-exr"=>Some(ImageFormat::OpenExr),
        _ => None,
    }
}

fn format_to_mimetype(format: ImageFormat) -> String {
    match format {
        ImageFormat::Png => "image/png",
        ImageFormat::Jpeg => "image/jpeg",
        ImageFormat::Gif => "image/gif",
        ImageFormat::WebP => "image/webp",
        ImageFormat::Tiff => "image/tiff",
        _ => "application/octet-stream",
    }
    .to_string()
}

fn fetch_image(
    input: &str,
    max_image_fetch_size: usize,
) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
    if input.starts_with("![](http://") || input.starts_with("![](https://") {
        let url = &input["![](".len()..input.len() - 1];
        let response = reqwest::blocking::get(url)?;

        // Check Content-Length header if present
        if let Some(content_length) = response.content_length() {
            if content_length as usize > max_image_fetch_size {
                return Err(ValidationError::ImageTooLarge(
                    content_length as usize,
                    max_image_fetch_size,
                ));
            }
        }

        // Read the body with size limit to prevent unbounded memory allocation
        let mut data = Vec::new();
        let mut limited_reader = response.take((max_image_fetch_size + 1) as u64);
        limited_reader.read_to_end(&mut data)?;

        if data.len() > max_image_fetch_size {
            return Err(ValidationError::ImageTooLarge(
                data.len(),
                max_image_fetch_size,
            ));
        }

        let format = image::guess_format(&data)?;
        // TODO Remove this clone
        let img = ImageReader::with_format(Cursor::new(data.clone()), format).decode()?;
        let height: usize = img.height().try_into()?;
        let width: usize = img.width().try_into()?;
        let mimetype = format_to_mimetype(format);
        Ok((data.to_vec(), mimetype, height, width))
    } else if input.starts_with("![](data:") {
        // Remove ![](....)
        let content = &input["![](data:".len()..input.len() - 1];
        let tokens: Vec<_> = content.split(';').collect();
        if tokens.len() != 2 {
            return Err(ValidationError::InvalidImageContent(content.to_string()));
        }
        let mimetype = tokens[0];
        let content = tokens[1];

        if !content.starts_with("base64,") {
            return Err(ValidationError::InvalidImageContent(content.to_string()));
        }

        let data = STANDARD.decode(&content["base64,".len()..])?;
        let img = if let Some(format) = format_from_mimetype(mimetype) {
            ImageReader::with_format(Cursor::new(&data), format).decode()?
        } else {
            ImageReader::new(Cursor::new(&data))
                .with_guessed_format()
                .map_err(|_io_error| ValidationError::InvalidImageContent(content.to_string()))?
                .decode()?
        };

        let height: usize = img.height().try_into()?;
        let width: usize = img.width().try_into()?;
        Ok((data, mimetype.to_string(), height, width))
    } else {
        Err(ValidationError::InvalidImageContent(input.to_string()))
    }
}

fn image_tokens(
    config: &Config,
    preprocessor_config: Option<&HubPreprocessorConfig>,
    height: usize,
    width: usize,
) -> String {
    use Config::*;
    use HubPreprocessorConfig::*;
    match config {
        Idefics => "<image>".to_string(),
        Mllama => "<|image|>".to_string(),
        Idefics2(config) => {
            const FAKE: &str = "<fake_token_around_image>";
            const IMAGE: &str = "<image>";

            let slots = config.get_number_of_features(height, width);

            let mut image_string = String::with_capacity(2 * FAKE.len() + slots * IMAGE.len());
            image_string.push_str(FAKE);
            image_string.extend(iter::repeat_n(IMAGE, slots));
            image_string.push_str(FAKE);

            if matches!(
                preprocessor_config,
                Some(Idefics2Processor(Idefics2Preprocessor {
                    do_image_splitting: true,
                    ..
                }))
            ) {
                image_string = image_string.repeat(5);
            };

            image_string
        }
        Idefics3(config) => {
            const FAKE: &str = "<fake_token_around_image>";
            const IMAGE: &str = "<image>";
            const GLOBAL_IMG: &str = "<global-img>";

            let max_longest_edge_for_image_resize = config.get_max_longest_edge_for_image_resize();
            let max_image_size = config.get_max_image_size();

            let (height, width) = {
                let h = height as f32;
                let w = width as f32;

                // First resize to max_longest_edge (always scale to this size)
                let scale1 = max_longest_edge_for_image_resize as f32 / h.max(w);
                let (h, w) = (h * scale1, w * scale1);

                // Ensure we dont exceed max_size (only scale down)
                let scale2 = (max_image_size as f32 / h.max(w)).min(1.0);

                ((h * scale2) as usize, (w * scale2) as usize)
            };

            let image_seq_len = config.get_number_of_features();
            let max_edge = config.get_max_longest_edge();

            let (image_rows, image_cols) = if height > max_edge || width > max_edge {
                (
                    (height as f32 / max_edge as f32).ceil() as usize,
                    (width as f32 / max_edge as f32).ceil() as usize,
                )
            } else {
                (0, 0)
            };

            let mut image_string = String::new();

            if image_rows == 0 && image_cols == 0 {
                // Single image case
                image_string.push_str(FAKE);
                image_string.push_str(GLOBAL_IMG);
                image_string.push_str(&IMAGE.repeat(image_seq_len));
                image_string.push_str(FAKE);
            } else {
                // Split image case
                for n_h in 0..image_rows {
                    for n_w in 0..image_cols {
                        image_string.push_str(FAKE);
                        image_string.push_str(&format!("<row_{}_col_{}>", n_h + 1, n_w + 1));
                        image_string.push_str(&IMAGE.repeat(image_seq_len));
                    }
                    image_string.push('\n');
                }

                image_string.push('\n');
                image_string.push_str(FAKE);
                image_string.push_str(GLOBAL_IMG);
                image_string.push_str(&IMAGE.repeat(image_seq_len));
                image_string.push_str(FAKE);
            }

            image_string
        }
        Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        Llama4(config) => {
            const IMAGE_START: &str = "<|image_start|>";
            const IMAGE: &str = "<|image|>";
            const IMAGE_END: &str = "<|image_end|>";
            const PATCH: &str = "<|patch|>";
            const TILE_X_SEP: &str = "<|tile_x_separator|>";
            const TILE_Y_SEP: &str = "<|tile_y_separator|>";

            let image_height = config.image_size();
            let patch_size = config.patch_size();
            let pixel_shuffle_ratio = config.pixel_shuffle_ratio();
            let max_patches = match preprocessor_config {
                Some(HubPreprocessorConfig::Llama4Processor(cfg)) => cfg.max_patches,
                _ => panic!("Expected Llama4Processor in preprocessor_config"),
            };
            let downsample_ratio =
                (1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize;

            let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width, max_patches);
            let image_width = image_height; // Assuming pixel shape: [H][W][C]

            let num_patches_per_chunk =
                (image_height / patch_size) * (image_width / patch_size) / downsample_ratio;

            let mut img_string = String::new();
            img_string.push_str(IMAGE_START);

            if ratio_h * ratio_w > 1 {
                for _yy in 0..ratio_h {
                    for xx in 0..ratio_w {
                        img_string.push_str(&PATCH.repeat(num_patches_per_chunk));
                        if xx < ratio_w - 1 {
                            img_string.push_str(TILE_X_SEP);
                        }
                    }
                    img_string.push_str(TILE_Y_SEP);
                }
            }

            img_string.push_str(IMAGE);
            img_string.push_str(&PATCH.repeat(num_patches_per_chunk));
            img_string.push_str(IMAGE_END);

            img_string
        }
        Qwen2Vl(config) => format!(
            "<|vision_start|>{:?}<|vision_end|>",
            "<|image_pad|>".repeat(config.get_number_of_features(height, width))
        ),
        Qwen2_5Vl(config) => format!(
            "<|vision_start|>{:?}<|vision_end|>",
            "<|image_pad|>".repeat(config.get_number_of_features(height, width))
        ),
        Gemma3(_config) => {
            // TODO: prefer using the config to determine the number of features
            let num_mm_soft_tokens_per_image = 256;
            format!(
                "\n\n<start_of_image>{}<end_of_image>\n\n",
                "<image_soft_token>".repeat(num_mm_soft_tokens_per_image)
            )
        }
        _ => unimplemented!("Images tokens are not supported for this model configuration"),
    }
}

fn image_tokens_fixup(config: &Config, text: String) -> String {
    match config {
        Config::Idefics2(_) => {
            const FAKE: &str = "<fake_token_around_image>";
            text.replace(&format!("{FAKE}{FAKE}"), FAKE)
        }
        _ => text,
    }
}

/// Get input length and optionally truncate it
fn prepare_input<T: TokenizerTrait>(
    inputs: String,
    _truncate: Option<usize>,
    add_special_tokens: bool,
    tokenizer: &T,
    config: Option<&Config>,
    preprocessor_config: Option<&HubPreprocessorConfig>,
    max_image_fetch_size: usize,
) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
    use Config::*;
    static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
    let (tokenizer_query, input_chunks) = match config {
        Some(
            config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Llama4(_)
            | Paligemma(_) | LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)),
        ) => {
            let mut input_chunks = Vec::new();
            let mut tokenizer_query = String::with_capacity(inputs.len());
            let mut start = 0;
            for chunk in RE.find_iter(&inputs) {
                let chunk_start = chunk.start();
                let chunk_end = chunk.end();
                if chunk_start != start {
                    input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string()));
                    tokenizer_query.push_str(&inputs[start..chunk_start]);
                }
                let (data, mimetype, height, width) =
                    fetch_image(&inputs[chunk_start..chunk_end], max_image_fetch_size)?;
                input_chunks.push(Chunk::Image(Image { data, mimetype }));
                tokenizer_query.push_str(&image_tokens(config, preprocessor_config, height, width));
                start = chunk_end;
            }
            if start != inputs.len() {
                input_chunks.push(Chunk::Text(inputs[start..].to_string()));
                tokenizer_query.push_str(&inputs[start..]);
            }

            tokenizer_query = image_tokens_fixup(config, tokenizer_query);

            (tokenizer_query, input_chunks)
        }
        _ => (inputs.clone(), vec![Chunk::Text(inputs)]),
    };

    // Get the number of tokens in the input
    let encoding = tokenizer
        .encode_trait(tokenizer_query, add_special_tokens)
        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;

    Ok((encoding, input_chunks))
}

type TokenizerRequest = (
    (String, bool, Option<usize>),
    oneshot::Sender<Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError>>,
    Span,
);

#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Image {
    pub data: Vec<u8>,
    pub mimetype: String,
}

#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Chunk {
    Text(String),
    Image(Image),
}

/// Convert input chunks to a stringly-typed input for backwards
/// compat for backends that haven't implemented chunked inputs.
pub trait ChunksToString {
    /// Convert chunks to string.
    fn chunks_to_string(&self) -> String;
}

impl ChunksToString for Vec<Chunk> {
    fn chunks_to_string(&self) -> String {
        let mut output = String::new();
        self.iter().for_each(|c| match &c {
            Chunk::Text(text) => output.push_str(text),
            Chunk::Image(Image { data, mimetype }) => {
                let encoded = STANDARD.encode(data);
                output.push_str(&format!("![](data:{};base64,{})", mimetype, encoded))
            }
        });
        output
    }
}

#[derive(Debug, Clone)]
pub enum ValidGrammar {
    Json(String),
    Regex(String),
}

#[derive(Debug, Clone)]
pub struct ValidParameters {
    /// / exponential scaling output probability distribution
    pub temperature: f32,
    /// / restricting to the k highest probability elements
    pub top_k: u32,
    /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
    pub top_p: f32,
    /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
    pub typical_p: f32,
    /// / apply sampling on the logits
    pub do_sample: bool,
    /// / random seed for sampling
    pub seed: u64,
    /// / repetition penalty
    pub repetition_penalty: f32,
    /// / frequency penalty
    pub frequency_penalty: f32,
    /// / token watermarking using "A Watermark for Large Language Models"
    pub watermark: bool,
    /// / grammar (applied if not empty)
    pub grammar: Option<ValidGrammar>,
}

#[derive(Debug, Clone)]
pub struct ValidStoppingParameters {
    /// / Maximum number of generated tokens
    pub max_new_tokens: u32,
    /// Maximum number of generated tokens before being re-queued by the system
    pub max_total_new_tokens: u32,
    /// / Optional stopping sequences
    pub stop_sequences: Vec<String>,
    /// / Ignore end of sequence token
    /// / used for benchmarking
    pub ignore_eos_token: bool,
}

#[derive(Debug, Clone)]
pub struct ValidGenerateRequest {
    pub inputs: Vec<Chunk>,
    pub input_ids: Option<Arc<Vec<u32>>>,
    pub input_length: u32,
    pub truncate: u32,
    pub add_special_tokens: bool,
    pub decoder_input_details: bool,
    pub parameters: ValidParameters,
    pub stopping_parameters: ValidStoppingParameters,
    pub top_n_tokens: u32,
    pub adapter_id: Option<String>,
}

#[derive(Error, Debug)]
pub enum ValidationError {
    #[error("`best_of` must be > 0 and <= {0}. Given: {1}")]
    BestOf(usize, usize),
    #[error("`best_of` != 1 is not allowed for this endpoint")]
    BestOfDisabled,
    #[error("you must use sampling when `best_of` is > 1")]
    BestOfSampling,
    #[error("`seed` must not be set when `best_of` > 1")]
    BestOfSeed,
    #[error("`best_of` != 1 is not supported when streaming tokens")]
    BestOfStream,
    #[error("`top_n_tokens` must be >= 0 and <= {0}. Given: {1}")]
    TopNTokens(u32, u32),
    #[error("`top_n_tokens` != 0 is not allowed for this endpoint")]
    TopNTokensDisabled,
    #[error("`decoder_input_details` == true is not supported when streaming tokens")]
    PrefillDetailsStream,
    #[error("`temperature` must be strictly positive")]
    Temperature,
    #[error("`repetition_penalty` must be strictly positive")]
    RepetitionPenalty,
    #[error("`frequency_penalty` must be >= -2.0 and <= 2.0")]
    FrequencyPenalty,
    #[error("`top_p` must be > 0.0 and < 1.0")]
    TopP,
    #[error("`top_k` must be strictly positive")]
    TopK,
    #[error("`truncate` must be strictly positive and less than {0}. Given: {1}")]
    Truncate(usize, usize),
    #[error("`typical_p` must be > 0.0 and < 1.0")]
    TypicalP,
    #[error("one of `max_new_tokens` or `truncate` must be set if a fast tokenizer is not in use")]
    UnsetMaxNewTokens,
    #[error("`max_new_tokens` must be strictly positive")]
    NegativeMaxNewTokens,
    #[error("`max_new_tokens` must be <= {0}. Given: {1}")]
    MaxNewTokens(usize, u32),
    #[error("`inputs` tokens + `max_new_tokens` must be <= {0}. Given: {1} `inputs` tokens and {2} `max_new_tokens`")]
    MaxTotalTokens(usize, usize, u32),
    #[error("`inputs` must have less than {0} tokens. Given: {1}")]
    InputLength(usize, usize),
    #[error("`inputs` cannot be empty")]
    EmptyInput,
    #[error("`stop` supports up to {0} stop sequences. Given: {1}")]
    StopSequence(usize, usize),
    #[error("tokenizer error {0}")]
    Tokenizer(String),
    #[error("grammar is not supported")]
    Grammar,
    #[error("grammar is not valid: {0}")]
    InvalidGrammar(String),
    #[error("cannot compile regex from schema: {0}")]
    RegexFromSchema(anyhow::Error),
    #[error("base64 encoding is invalid: {0}")]
    InvalidBase64(#[from] base64::DecodeError),
    #[error("invalid image: {0}")]
    InvalidImage(#[from] image::ImageError),
    #[error("invalid integer: {0}")]
    InvalidInt(#[from] core::num::TryFromIntError),
    #[error("invalid image content: {0}")]
    InvalidImageContent(String),
    #[error("Could not fetch image: {0}")]
    FailedFetchImage(#[from] reqwest::Error),
    #[error("Image size {0} bytes exceeds maximum allowed size of {1} bytes")]
    ImageTooLarge(usize, usize),
    #[error("Failed to read image data: {0}")]
    ImageReadError(#[from] std::io::Error),
    #[error("{0} modality is not supported")]
    UnsupportedModality(&'static str),
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{Idefics2, PaliTextConfig, Paligemma};
    use crate::default_parameters;
    use crate::tests::get_tokenizer;

    #[tokio::test]
    async fn test_validation_max_new_tokens() {
        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
        let max_input_length = 5;
        let max_total_tokens = 6;
        let workers = 1;
        let disable_grammar_support = true;
        let config = None;
        let validation = Validation::new(
            workers,
            tokenizer,
            config,
            None,
            max_best_of,
            max_stop_sequence,
            max_top_n_tokens,
            max_input_length,
            max_total_tokens,
            disable_grammar_support,
            1024 * 1024 * 1024, // 1GB
        );

        let max_new_tokens = 10;
        match validation
            .validate_input("Hello".to_string(), true, None, Some(max_new_tokens))
            .await
        {
            Err(ValidationError::MaxTotalTokens(6, 1, 10)) => (),
            // Ok((_s, _, 0, 10)) => (),
            r => panic!("Unexpected not max new tokens: {r:?}"),
        }
    }

    #[tokio::test]
    async fn test_validation_input_length() {
        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
        let max_input_length = 5;
        let max_total_tokens = 6;
        let disable_grammar_support = true;
        let workers = 1;
        let config = None;
        let validation = Validation::new(
            workers,
            tokenizer,
            config,
            None,
            max_best_of,
            max_stop_sequence,
            max_top_n_tokens,
            max_input_length,
            max_total_tokens,
            disable_grammar_support,
            1024 * 1024 * 1024, // 1GB
        );

        let max_new_tokens = 10;
        match validation
            .validate_input("Hello".to_string(), true, None, Some(max_new_tokens))
            .await
        {
            Err(ValidationError::MaxTotalTokens(6, 1, 10)) => (),
            _ => panic!("Unexpected not max new tokens"),
        }
    }

    #[tokio::test]
    async fn test_validation_best_of_sampling() {
        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
        let max_input_length = 5;
        let max_total_tokens = 6;
        let workers = 1;
        let disable_grammar_support = true;
        let config = None;
        let validation = Validation::new(
            workers,
            tokenizer,
            config,
            None,
            max_best_of,
            max_stop_sequence,
            max_top_n_tokens,
            max_input_length,
            max_total_tokens,
            disable_grammar_support,
            1024 * 1024 * 1024, // 1GB
        );
        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    best_of: Some(2),
                    do_sample: false,
                    ..default_parameters()
                },
            })
            .await
        {
            Err(ValidationError::BestOfSampling) => (),
            _ => panic!("Unexpected not best of sampling"),
        }
    }

    #[tokio::test]
    async fn test_validation_top_p() {
        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
        let max_input_length = 5;
        let max_total_tokens = 106;
        let workers = 1;
        let disable_grammar_support = true;
        let config = None;
        let validation = Validation::new(
            workers,
            tokenizer,
            config,
            None,
            max_best_of,
            max_stop_sequence,
            max_top_n_tokens,
            max_input_length,
            max_total_tokens,
            disable_grammar_support,
            1024 * 1024 * 1024, // 1GB
        );
        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    top_p: Some(1.0),
                    max_new_tokens: Some(5),
                    ..default_parameters()
                },
            })
            .await
        {
            Err(ValidationError::TopP) => (),
            _ => panic!("Unexpected top_p"),
        }

        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    top_p: Some(0.99),
                    max_new_tokens: Some(5),
                    ..default_parameters()
                },
            })
            .await
        {
            Ok(_) => (),
            _ => panic!("Unexpected top_p error"),
        }

        let valid_request = validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    top_p: None,
                    max_new_tokens: Some(5),
                    ..default_parameters()
                },
            })
            .await
            .unwrap();
        // top_p == 1.0 is invalid for users to ask for but it's the default resolved value.
        assert_eq!(valid_request.parameters.top_p, 1.0);
    }

    #[tokio::test]
    async fn test_validation_top_n_tokens() {
        let tokenizer = get_tokenizer();
        let max_best_of = 2;
        let max_stop_sequences = 3;
        let max_top_n_tokens = 4;
        let max_input_length = 5;
        let max_total_tokens = 106;
        let workers = 1;
        let disable_grammar_support = true;
        let config = None;
        let validation = Validation::new(
            workers,
            tokenizer,
            config,
            None,
            max_best_of,
            max_stop_sequences,
            max_top_n_tokens,
            max_input_length,
            max_total_tokens,
            disable_grammar_support,
            1024 * 1024 * 1024, // 1GB
        );
        match validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    top_n_tokens: Some(5),
                    max_new_tokens: Some(5),
                    ..default_parameters()
                },
            })
            .await
        {
            Err(ValidationError::TopNTokens(4, 5)) => (),
            _ => panic!("Unexpected top_n_tokens"),
        }

        validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    top_n_tokens: Some(4),
                    max_new_tokens: Some(5),
                    ..default_parameters()
                },
            })
            .await
            .unwrap();

        validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    top_n_tokens: Some(0),
                    max_new_tokens: Some(5),
                    ..default_parameters()
                },
            })
            .await
            .unwrap();

        let valid_request = validation
            .validate(GenerateRequest {
                inputs: "Hello".to_string(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    top_n_tokens: None,
                    max_new_tokens: Some(5),
                    ..default_parameters()
                },
            })
            .await
            .unwrap();

        assert_eq!(valid_request.top_n_tokens, 0);
    }

    static PIXEL_GIF: &str = "R0lGODdhAQABAIEAAP///wAAAAAAAAAAACwAAAAAAQABAAAIBAABBAQAOw==";

    #[tokio::test]
    async fn test_prepare_input_chunks() {
        let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();

        let tokenizer = get_tokenizer();

        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
        let max_input_length = 5;
        let max_total_tokens = 6;
        let disable_grammar_support = true;
        let workers = 1;
        let config = Config::Paligemma(Paligemma {
            text_config: PaliTextConfig {
                num_image_tokens: 1,
            },
        });
        let validation = Validation::new(
            workers,
            tokenizer,
            Some(config),
            None,
            max_best_of,
            max_stop_sequence,
            max_top_n_tokens,
            max_input_length,
            max_total_tokens,
            disable_grammar_support,
            1024 * 1024 * 1024, // 1GB
        );

        let chunks = match validation
            .tokenize(
                format!("test![](data:image/gif;base64,{})", PIXEL_GIF),
                true,
                None,
            )
            .await
        {
            Ok((_encoding, chunks)) => chunks,
            _ => panic!("Unexpected tokenization failure"),
        };

        assert!(
            chunks
                == vec![
                    Chunk::Text("test".to_string()),
                    Chunk::Image(Image {
                        data: pixel_data.clone(),
                        mimetype: "image/gif".to_string()
                    })
                ],
            "Failed to process images",
        );
    }

    #[tokio::test]
    async fn test_idefics2_correct_n_fake_tokens() {
        let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();

        let tokenizer = get_tokenizer();

        let max_best_of = 2;
        let max_stop_sequence = 3;
        let max_top_n_tokens = 4;
        let max_input_length = 5;
        let max_total_tokens = 6;
        let disable_grammar_support = true;
        let workers = 1;
        let config = Config::Idefics2(Idefics2 {});
        let validation = Validation::new(
            workers,
            tokenizer,
            Some(config),
            Some(HubPreprocessorConfig::Idefics2Processor(
                Idefics2Preprocessor {
                    do_image_splitting: true,
                },
            )),
            max_best_of,
            max_stop_sequence,
            max_top_n_tokens,
            max_input_length,
            max_total_tokens,
            disable_grammar_support,
            1024 * 1024 * 1024, // 1GB
        );

        let (encoding, chunks) = match validation
            .tokenize(
                format!(
                    "test![](data:image/gif;base64,{})![](data:image/gif;base64,{})",
                    PIXEL_GIF, PIXEL_GIF
                ),
                true,
                None,
            )
            .await
        {
            Ok((encoding, chunks)) => (encoding, chunks),
            _ => panic!("Unexpected tokenization failure"),
        };

        assert!(
            chunks
                == vec![
                    Chunk::Text("test".to_string()),
                    Chunk::Image(Image {
                        data: pixel_data.clone(),
                        mimetype: "image/gif".to_string()
                    }),
                    Chunk::Image(Image {
                        data: pixel_data.clone(),
                        mimetype: "image/gif".to_string()
                    })
                ],
            "Failed to process images",
        );

        // Verify the number of fake tokens:
        //
        // - Two images surrounded/separated by a fake token = 3.
        // - Both are split in 5 subimages, separated by a fake token: 2 * 4
        //
        // Fake tokens get split up by the testing tokenizer, but we don't care.
        assert_eq!(
            encoding
                .get_tokens()
                .iter()
                .filter(|t| *t == "fake")
                .count(),
            11
        );
    }
}


================================================
FILE: router/src/vertex.rs
================================================
use crate::infer::Infer;
use crate::server::{generate_internal, ComputeType};
use crate::{ChatRequest, ErrorResponse, GenerateParameters, GenerateRequest};
use axum::extract::Extension;
use axum::http::{HeaderMap, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::Json;
use serde::{Deserialize, Serialize};
use tracing::instrument;
use tracing_opentelemetry::OpenTelemetrySpanExt;
use utoipa::ToSchema;

#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct GenerateVertexInstance {
    #[schema(example = "What is Deep Learning?")]
    pub inputs: String,
    #[schema(nullable = true, default = "null", example = "null")]
    pub parameters: Option<GenerateParameters>,
}

#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
#[serde(untagged)]
pub(crate) enum VertexInstance {
    Generate(GenerateVertexInstance),
    Chat(ChatRequest),
}

#[derive(Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct VertexRequest {
    #[serde(rename = "instances")]
    pub instances: Vec<VertexInstance>,
}

#[derive(Clone, Deserialize, ToSchema, Serialize)]
pub(crate) struct VertexResponse {
    pub predictions: Vec<String>,
}

/// Generate tokens from Vertex request
#[utoipa::path(
post,
tag = "Text Generation Inference",
path = "/vertex",
request_body = VertexRequest,
responses(
(status = 200, description = "Generated Text", body = VertexResponse),
(status = 424, description = "Generation Error", body = ErrorResponse,
example = json ! ({"error": "Request failed during generation"})),
(status = 429, description = "Model is overloaded", body = ErrorResponse,
example = json ! ({"error": "Model is overloaded"})),
(status = 422, description = "Input validation error", body = ErrorResponse,
example = json ! ({"error": "Input validation error"})),
(status = 500, description = "Incomplete generation", body = ErrorResponse,
example = json ! ({"error": "Incomplete generation"})),
)
)]
#[instrument(
    skip_all,
    fields(
        total_time,
        validation_time,
        queue_time,
        inference_time,
        time_per_token,
        seed,
    )
)]
pub(crate) async fn vertex_compatibility(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Extension(context): Extension<Option<opentelemetry::Context>>,
    Json(req): Json<VertexRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    if let Some(context) = context {
        span.set_parent(context);
    }

    metrics::counter!("tgi_request_count").increment(1);

    // check that theres at least one instance
    if req.instances.is_empty() {
        return Err((
            StatusCode::UNPROCESSABLE_ENTITY,
            Json(ErrorResponse {
                error: "Input validation error".to_string(),
                error_type: "Input validation error".to_string(),
            }),
        ));
    }

    // Prepare futures for all instances
    let mut futures = Vec::with_capacity(req.instances.len());

    for instance in req.instances.into_iter() {
        let generate_request = match instance {
            VertexInstance::Generate(instance) => GenerateRequest {
                inputs: instance.inputs.clone(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    do_sample: true,
                    max_new_tokens: instance.parameters.as_ref().and_then(|p| p.max_new_tokens),
                    seed: instance.parameters.as_ref().and_then(|p| p.seed),
                    details: true,
                    decoder_input_details: true,
                    ..Default::default()
                },
            },
            VertexInstance::Chat(instance) => {
                let (generate_request, _using_tools): (GenerateRequest, bool) =
                    instance.try_into_generate(&infer)?;
                generate_request
            }
        };

        let infer_clone = infer.clone();
        let compute_type_clone = compute_type.clone();
        let span_clone = span.clone();

        futures.push(async move {
            generate_internal(
                Extension(infer_clone),
                compute_type_clone,
                Json(generate_request),
                span_clone,
            )
            .await
            .map(|(_, _, Json(generation))| generation.generated_text)
            .map_err(|_| {
                (
                    StatusCode::INTERNAL_SERVER_ERROR,
                    Json(ErrorResponse {
                        error: "Incomplete generation".into(),
                        error_type: "Incomplete generation".into(),
                    }),
                )
            })
        });
    }

    // execute all futures in parallel, collect results, returning early if any error occurs
    let results = futures::future::join_all(futures).await;
    let predictions: Result<Vec<_>, _> = results.into_iter().collect();
    let predictions = predictions?;

    let response = VertexResponse { predictions };
    Ok((HeaderMap::new(), Json(response)).into_response())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Message, MessageBody, MessageContent};

    #[test]
    fn vertex_deserialization() {
        let string = serde_json::json!({

        "instances": [
            {
                "messages": [{"role": "user", "content": "What's Deep Learning?"}],
                "max_tokens": 128,
                "top_p": 0.95,
                "temperature": 0.7
            }
        ]

        });
        let request: VertexRequest = serde_json::from_value(string).expect("Can deserialize");
        assert_eq!(
            request,
            VertexRequest {
                instances: vec![VertexInstance::Chat(ChatRequest {
                    messages: vec![Message {
                        name: None,
                        role: "user".to_string(),
                        body: MessageBody::Content {
                            content: MessageContent::SingleText(
                                "What's Deep Learning?".to_string()
                            )
                        },
                    },],
                    max_tokens: Some(128),
                    top_p: Some(0.95),
                    temperature: Some(0.7),
                    ..Default::default()
                })]
            }
        );
    }
}


================================================
FILE: rust-toolchain.toml
================================================
[toolchain]
# Released on: 30 January, 2025
# https://releases.rs/docs/1.84.1/
channel = "1.85.1"
components = ["rustfmt", "clippy"]


================================================
FILE: sagemaker-entrypoint.sh
================================================
#!/bin/bash

if [[ -z "${HF_MODEL_ID}" ]]; then
  echo "HF_MODEL_ID must be set"
  exit 1
fi
export MODEL_ID="${HF_MODEL_ID}"

if [[ -n "${HF_MODEL_REVISION}" ]]; then
  export REVISION="${HF_MODEL_REVISION}"
fi

if [[ -n "${SM_NUM_GPUS}" ]]; then
  export NUM_SHARD="${SM_NUM_GPUS}"
fi

if [[ -n "${HF_MODEL_QUANTIZE}" ]]; then
  export QUANTIZE="${HF_MODEL_QUANTIZE}"
fi

if [[ -n "${HF_MODEL_TRUST_REMOTE_CODE}" ]]; then
  export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}"
fi

text-generation-launcher --port 8080


================================================
FILE: server/.gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
text_generation_server/__pycache__/
text_generation_server/pb/__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

transformers
safetensors
flash-attention/
flash-attention-v2/
vllm/
llm-awq/
eetq/
mamba/


================================================
FILE: server/Makefile
================================================
include Makefile-flash-att
include Makefile-flash-att-v2
include Makefile-vllm
include Makefile-awq
include Makefile-selective-scan
include Makefile-exllamav2
include Makefile-flashinfer

unit-tests:
	pip install -U pip uv
	uv pip install -e ".[dev]"
	uv sync --inexact --extra dev --active
	pytest -s -vv -m "not private" tests

gen-server:
	# Compile protos
	pip install -U pip uv
	uv pip install -r requirements_gen.txt
	mkdir text_generation_server/pb || true
	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
	find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
	touch text_generation_server/pb/__init__.py

gen-server-raw:
	mkdir text_generation_server/pb || true
	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
	find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
	touch text_generation_server/pb/__init__.py

install-server: gen-server
	uv sync --inexact --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --active


install: install-cuda
	echo "Installed server"

install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
	uv sync --inexact --extra attention --extra bnb --active
	uv pip install nvidia-nccl-cu12==2.22.3
	kernels download .

install-rocm: install-server install-flash-attention-v2-rocm  install-vllm-rocm

export-requirements:
	uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
	uv pip compile pyproject.toml --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11
	uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11
	uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11


================================================
FILE: server/Makefile-awq
================================================
# Fork that adds only the correct stream to this kernel in order
# to make cuda graphs work.
awq_commit := bd1dc2d5254345cc76ab71894651fb821275bdd4

awq:
	rm -rf llm-awq
	git clone https://github.com/huggingface/llm-awq

build-awq: awq
	cd llm-awq/ && git fetch && git checkout $(awq_commit)
	cd llm-awq/awq/kernels && python setup.py build

install-awq: build-awq
	pip uninstall awq_inference_engine -y || true
	cd llm-awq/awq/kernels && python setup.py install


================================================
FILE: server/Makefile-eetq
================================================
eetq_commit := 465e9726bf7ae30803a2d0dd9e5d4315aef17491

eetq:
    # Clone eetq
	pip install packaging
	git clone https://github.com/NetEase-FuXi/EETQ.git eetq

build-eetq: eetq
	cd eetq && git fetch && git checkout $(eetq_commit) && git submodule update --init --recursive
	cd eetq && python setup.py build

install-eetq: build-eetq
	cd eetq && python setup.py install


================================================
FILE: server/Makefile-exllamav2
================================================
exllamav2_commit := v0.1.8

build-exllamav2:
	git clone https://github.com/turboderp/exllamav2.git exllamav2 && \
	cd exllamav2 && git fetch && git checkout $(exllamav2_commit)  && \
	git submodule update --init --recursive && \
	pip install -r requirements.txt && \
	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py build

install-exllamav2: build-exllamav2
	cd exllamav2/ &&  \
	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py install


================================================
FILE: server/Makefile-flash-att
================================================
flash_att_commit := ceee0de88c037ee6eda5e75c813a8648e4bcb1c9

build-flash-attention:
	if [ ! -d 'flash-attention' ]; then \
		pip install -U packaging ninja  --no-cache-dir && \
		git clone https://github.com/Narsil/flash-attention.git; \
	fi
	cd flash-attention && git fetch && git checkout $(flash_att_commit) && \
	MAX_JOBS=8 python setup.py build && cd csrc/layer_norm && python setup.py build && cd ../rotary && python setup.py build

install-flash-attention: build-flash-attention
	cd flash-attention && git checkout $(flash_att_commit) && MAX_JOBS=8 python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install


================================================
FILE: server/Makefile-flash-att-v2
================================================
flash_att_v2_commit_cuda := v2.6.1
flash_att_v2_commit_rocm := 47bd46e0204a95762ae48712fd1a3978827c77fd

build-flash-attention-v2-cuda:
	pip install -U packaging wheel
	pip install --no-build-isolation flash-attn==$(flash_att_v2_commit_cuda)

install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
	echo "Flash v2 installed"

build-flash-attention-v2-rocm:
	if [ ! -d 'flash-attention-v2' ]; then \
		pip install -U packaging ninja  --no-cache-dir && \
		git clone https://github.com/mht-sharma/flash-attention.git flash-attention-v2 && \
		cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm) && \
		git submodule update --init --recursive && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build; \
	fi

install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
	cd flash-attention-v2 &&  \
	GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install


================================================
FILE: server/Makefile-flashinfer
================================================
install-flashinfer:
	# We need fsspec as an additional dependency, but
	# `pip install flashinfer` cannot resolve it.
	uv pip install fsspec sympy==1.13.1 numpy
	uv pip install -U setuptools
	TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0+PTX" FLASHINFER_ENABLE_AOT=1 pip install git+https://github.com/flashinfer-ai/flashinfer.git@v0.2.0.post2#egg=flashinfer-python  --no-build-isolation


================================================
FILE: server/Makefile-selective-scan
================================================
selective_scan_commit := 2a3704fd47ba817b415627b06fd796b971fdc137

causal-conv1d:
	rm -rf causal-conv1d
	git clone https://github.com/Dao-AILab/causal-conv1d.git

build-causal-conv1d: causal-conv1d
	cd causal-conv1d/ && git checkout v1.1.1 # known latest working version tag
	cd causal-conv1d/ && CAUSAL_CONV1D_FORCE_BUILD=TRUE python setup.py build

install-causal-conv1d: build-causal-conv1d
	pip uninstall causal-conv1d -y || true
	cd causal-conv1d/ && pip install .

# selective-scan dependends on causal-conv1d
selective-scan:
	rm -rf mamba
	git clone https://github.com/state-spaces/mamba.git mamba

build-selective-scan: selective-scan
	cd mamba/ && git fetch && git checkout $(selective_scan_commit)
	cd mamba && python setup.py build

install-selective-scan: install-causal-conv1d build-selective-scan
	pip uninstall selective-scan-cuda -y || true
	cd mamba && pip install .

build-all: build-causal-conv1d build-selective-scan


================================================
FILE: server/Makefile-vllm
================================================
commit_rocm := de990cd12537f78f74e40b5c8ee1a62d63d734dd

build-vllm-rocm:
	if [ ! -d 'vllm' ]; then \
		pip install -U ninja packaging --no-cache-dir && \
		git clone https://github.com/mht-sharma/vllm.git vllm; \
	fi
	cd vllm && git fetch && git checkout $(commit_rocm) &&  \
	PYTORCH_ROCM_ARCH="gfx90a;gfx942" python3 setup.py bdist_wheel --dist-dir=dist

install-vllm-rocm: build-vllm-rocm
	cd vllm && git fetch && git checkout $(commit_rocm)


================================================
FILE: server/README.md
================================================
# Text Generation Inference Python gRPC Server

A Python gRPC server for Text Generation Inference

## Install

```shell
make install
```

## Run

```shell
make run-dev
```


================================================
FILE: server/bounds-from-nix.py
================================================
#!/usr/bin/env python3

import json
import subprocess
from typing import Dict, Union
import toml

# Special cases that have download URLs.
SKIP = {"attention-kernels", "marlin-kernels", "moe-kernels"}


def is_optional(info: Union[str, Dict[str, str]]) -> bool:
    return isinstance(info, dict) and "optional" in info and info["optional"]


if __name__ == "__main__":
    with open("pyproject.toml") as f:
        pyproject = toml.load(f)

    nix_packages = json.loads(
        subprocess.run(
            ["nix", "develop", ".#server", "--command", "pip", "list", "--format=json"],
            stdout=subprocess.PIPE,
        ).stdout
    )

    nix_packages = {pkg["name"]: pkg["version"] for pkg in nix_packages}

    packages = []
    optional_packages = []

    for package, info in pyproject["tool"]["poetry"]["dependencies"].items():
        if package in nix_packages and package not in SKIP:
            if is_optional(info):
                optional_packages.append(f'"{package}@^{nix_packages[package]}"')
            else:
                packages.append(f'"{package}@^{nix_packages[package]}"')

    print(f"poetry add {' '.join(packages)}")
    print(f"poetry add --optional {' '.join(optional_packages)}")


================================================
FILE: server/custom_kernels/custom_kernels/fused_attention_cuda.cu
================================================
#include <ATen/Dispatch.h>
#include <THC/THCAtomics.cuh>
#include <ATen/ATen.h>
#include <torch/torch.h>
#include <vector>

#include <optional>

/**
* Friendly reminder of how multithreading works in CUDA: https://developer.nvidia.com/blog/even-easier-introduction-cuda
* Check example at https://github.com/thomasw21/LinearTransformers/blob/main/model/attention/fast_weight/fast_weight_cuda.cu
**/

// Available in pytorch main
//#define DISPATCH_CASE_FLOATING_TYPES(...) \
//  at::AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
//  at::AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
//  at::AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
//  at::AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \

/*
* Forward passes
*/

/**
* cast to fp32 if in fp16 + mask + softmax computation in fp32 + cast back to original dtype
**/
template<typename attention_scores_scalar, int64_t min_kv_length_shard_size_per_thread>
__global__ void forward_masked_softmax_kernel(
    const torch::PackedTensorAccessor32<attention_scores_scalar, 2, torch::RestrictPtrTraits> attention_scores, // [B, KV]
    const torch::PackedTensorAccessor32<bool, 2, torch::RestrictPtrTraits> mask, // [B, KV]
    torch::PackedTensorAccessor32<attention_scores_scalar, 2, torch::RestrictPtrTraits> result, // [B, KV]
    const int64_t effective_kv_length,
    const dim3 blockDim,
    const int64_t rows_per_block,
    const int64_t kv_length,
    const int64_t batch_size
) {
    const auto row_id = threadIdx.x / effective_kv_length;
    const auto effective_kv_length_id = threadIdx.x % effective_kv_length;
    const auto kv_length_start = effective_kv_length_id * min_kv_length_shard_size_per_thread;
    auto kv_length_end_ = (effective_kv_length_id + 1) * min_kv_length_shard_size_per_thread;
    kv_length_end_ = (kv_length_end_ > kv_length) ? kv_length : kv_length_end_;
    const auto kv_length_end = kv_length_end_;

    const auto batch_id = blockIdx.x * rows_per_block + row_id;

    // We need 2 float storage for each row, one for max computation, the other for normalizing exponential
    extern __shared__ float temp_storage[];
    const auto row_id_mem_offset = row_id * 2;
    if (effective_kv_length_id == 0) {
        temp_storage[row_id_mem_offset] = -std::numeric_limits<float>::infinity();
        temp_storage[row_id_mem_offset + 1] = 0;
    }
    __syncthreads();

    // Compute mask and max
    if (batch_id < batch_size) {
        float thread_max = -std::numeric_limits<float>::infinity();
        for (int kv_length_id = kv_length_start; kv_length_id < kv_length_end; ++kv_length_id) {
            if (mask[batch_id][kv_length_id] == 0) {
                const float candidate = attention_scores[batch_id][kv_length_id];
                thread_max = (thread_max < candidate) ? candidate : thread_max;
            }
        }
        if (thread_max != -std::numeric_limits<float>::infinity()) {
            // TODO @thomasw21 with more memory we can probably compute a much faster `max-reduce` in parallel O(ln(n)) operations in each memory slot
            gpuAtomicMax(&temp_storage[row_id_mem_offset], thread_max);
        }
    }

    __syncthreads();

    // Compute exp(elt - max) masked
    float exponential[min_kv_length_shard_size_per_thread];
    if (batch_id < batch_size) {
        float thread_add = 0;
        for (int kv_length_id = kv_length_start; kv_length_id < kv_length_end; ++kv_length_id) {
            if (mask[batch_id][kv_length_id] == 0) {
                exponential[kv_length_id - kv_length_start] = std::exp(static_cast<float>(attention_scores[batch_id][kv_length_id]) - temp_storage[row_id_mem_offset]);
                thread_add = thread_add + exponential[kv_length_id - kv_length_start];
            } else {
                exponential[kv_length_id - kv_length_start] = 0.;
            }
        }
        if (thread_add > 0) {
            // TODO @thomasw21 with more memory we can probably compute a much faster `sum-reduce` in parallel O(ln(n)) operations in each memory slot
            gpuAtomicAdd(&temp_storage[row_id_mem_offset + 1], thread_add);
        }
    }

    __syncthreads();

    // Compute softmax
    if (batch_id < batch_size) {
        // If sum of all exponential is 0, we set the softmax values to 0
        if (temp_storage[row_id_mem_offset + 1] == 0.) {
            for (int kv_length_id = kv_length_start; kv_length_id < kv_length_end; ++kv_length_id) {
                result[batch_id][kv_length_id] = 0.;
            }
        } else {
            for (int kv_length_id = kv_length_start; kv_length_id < kv_length_end; ++kv_length_id) {
                result[batch_id][kv_length_id] = static_cast<attention_scores_scalar>(exponential[kv_length_id - kv_length_start] / temp_storage[row_id_mem_offset + 1]);
            }
        }
    }
}

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forward(
    const at::Tensor query,
    const at::Tensor key,
    const at::Tensor value,
    const std::optional<std::vector<at::Tensor>> layer_past,
    const at::Tensor attention_mask,
    const std::optional<at::Tensor> head_mask,
    const float inv_norm_factor,
    const int num_heads,
    const bool use_cache
) {
    auto query_layer = query;
    auto key_layer = key;
    auto value_layer = value;

     if (layer_past) {
        const auto past_key = (*layer_past).at(0);
        const auto past_value = (*layer_past).at(1);
        key_layer = at::cat({past_key, key_layer}, 2);
        value_layer = at::cat({past_value, value_layer}, 2);
    }

    std::optional<std::vector<at::Tensor>> present;
    if (use_cache) {
        present = {key_layer, value_layer};
    } else {
        present = {};
    }

    const auto batch_size = query_layer.size(0);
    const auto q_length = query_layer.size(2);
    const auto attn_head_size = query_layer.size(3);
    const auto batch_size_times_num_heads = batch_size * num_heads;
    const auto kv_length = key_layer.size(2);

    const auto query_view = query_layer.reshape({batch_size_times_num_heads, q_length, attn_head_size});
    auto key_view = key_layer.reshape({batch_size_times_num_heads, kv_length, attn_head_size}).transpose(1, 2);
    auto value_view = value_layer.reshape({batch_size_times_num_heads, kv_length, attn_head_size});

    auto query_scaled = query_view * inv_norm_factor;
    auto attention_scores = at::bmm(query_scaled, key_view);

    // Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_intial_dtype`
    at::Tensor attention_probs;
    if (true) {
        // TODO @thomasw21: it's easier to think of attention_scores as 2D tensors
        const auto attention_scores_2d = attention_scores.view({batch_size_times_num_heads * q_length, kv_length});
        const auto attention_mask_2d = attention_mask.view({batch_size_times_num_heads * q_length, kv_length});

        // Custom kernel
        attention_probs = at::empty_like(attention_scores_2d);

        // Check that inputs and contiguous + cuda tensors
        CHECK_INPUT(attention_scores_2d);
        CHECK_INPUT(attention_mask_2d);

        // TODO @thomas21: change by to this as it's cleaner when pytorch 1.13 comes out
        // DISPATCH_CASE_FLOATING_TYPES(attention_scores.scalar_type(), "masked_softmax", [&] {
        AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, attention_scores.scalar_type(), "masked_softmax", [&] {
            /*
            * Understanding how GPUs work: https://developer.nvidia.com/blog/cuda-refresher-cuda-programming-model/
            * A100 specifications: https://images.nvidia.com/aem-dam/en-zz/Solutions/data-center/nvidia-ampere-architecture-whitepaper.pdf
            *  - SMs: 108
            *  - TPCs: 56 (What's that?)
            *  - Memory size: 40 GB
            *  - L2 Cache size: 40960 KB (shared across all SMs)
            *  - L1/Shared memory size: 192 KB (shared across all threads within a SM)
            *  - Max Threads / SM: 2048
            *  - Max Thread Blocks / SM: 32
            */

            /*
            * We should split [batch_size_times_num_heads_block, q_length] in seperate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
            * with multiple threads as we need to `sync_threads` to run exponential sum.
            * We maximise the usage of threads within a single block
            */
            // TODO @thomasw21 figure out everything warp related:
            //  - why do they have to be power of 2
            // TODO @thomas21 check why everyone is setting 1024 when officially it's 2048
            const auto MAX_THREADS_PER_SM = 1024;
            // TODO @thomasw21 figure out how to have longer sequences, currently the maximum is `max_kv_length = MAX_THREADS_PER_SM * MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD`
            const auto MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD = 4;
            // `effective_kv_length = ceil(kv_length / MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD)`
            const auto effective_kv_length = (kv_length - 1)/ MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD + 1;
            const auto rows_per_block = MAX_THREADS_PER_SM / effective_kv_length;
            const auto num_blocks = (batch_size_times_num_heads * q_length - 1) / rows_per_block + 1;

            const dim3 gridDim(num_blocks); // Number of blocks that run
            const dim3 blockDim(MAX_THREADS_PER_SM); // Number of threads that run per block
            const int shared_mem_forward = rows_per_block * 2 * sizeof(float);

            // 192 * 2 ** 10
            // const auto MAX_L1_MEMORY = 196608;
            // const auto MAX_SMs = 108;
            // TORCH_CHECK(batch_size_times_num_heads * q_length <= MAX_L1_MEMORY, "Shared memory exceeds 192KB limitation.");
            // TORCH_CHECK(gridDim.x * gridDim.y * gridDim.z <= MAX_SMs, "A100s only have 108 SMs. Raising as require blocks is bigger.");
            // TORCH_CHECK(blockDim.x * blockDim.y * blockDim.z <= MAX_THREADS_PER_SM, "A100s only have 2048 threads per block. Raising as require requested threads is higher.");

            forward_masked_softmax_kernel<scalar_t, MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD><<<gridDim, blockDim, shared_mem_forward>>>(
                attention_scores_2d.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
                attention_mask_2d.packed_accessor32<bool, 2, torch::RestrictPtrTraits>(),
                attention_probs.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
                effective_kv_length,
                blockDim,
                rows_per_block,
                kv_length,
                batch_size_times_num_heads * q_length
            );
        });
        attention_probs = attention_probs.view({batch_size_times_num_heads, q_length, kv_length});
    } else {
        // Pytorch C++ API
        auto input_dtype = attention_scores.scalar_type();
        if (input_dtype == at::ScalarType::Float) {
            attention_scores = attention_scores.to(at::ScalarType::Float);
        };
        // TODO @thomasw21 Figure out how to get minimum value
        auto attn_weights = attention_scores.masked_fill_(attention_mask, -1e34);
        attention_probs = attn_weights.softmax(-1, at::ScalarType::Float).to(input_dtype);
    }

    auto context_layer = attention_probs.bmm(value_view);

    // `_merge_heads`
    context_layer = context_layer.view({batch_size, num_heads, q_length, attn_head_size});
    context_layer = context_layer.permute({0, 2, 1, 3});
    context_layer = context_layer.reshape({batch_size, q_length, attn_head_size * num_heads});

    return std::make_tuple(context_layer, present, attention_probs);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "forward",
        &forward,
        "GPT-Neox attention mechanism forward (CUDA)"
    );
}


================================================
FILE: server/custom_kernels/custom_kernels/fused_bloom_attention_cuda.cu
================================================
#include <ATen/Dispatch.h>
#include <THC/THCAtomics.cuh>
#include <ATen/ATen.h>
#include <torch/torch.h>
#include <vector>

#include <optional>

/**
* Friendly reminder of how multithreading works in CUDA: https://developer.nvidia.com/blog/even-easier-introduction-cuda
* Check example at https://github.com/thomasw21/LinearTransformers/blob/main/model/attention/fast_weight/fast_weight_cuda.cu
**/

// Available in pytorch main
//#define DISPATCH_CASE_FLOATING_TYPES(...) \
//  at::AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
//  at::AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
//  at::AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
//  at::AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \

/*
* Forward passes
*/

/**
* cast to fp32 if in fp16 + mask + softmax computation in fp32 + cast back to original dtype
**/
template<typename attention_scores_scalar, int64_t min_kv_length_shard_size_per_thread>
__global__ void forward_masked_softmax_kernel(
    const torch::PackedTensorAccessor32<attention_scores_scalar, 2, torch::RestrictPtrTraits> attention_scores, // [B, KV]
    const torch::PackedTensorAccessor32<bool, 2, torch::RestrictPtrTraits> mask, // [B, KV]
    torch::PackedTensorAccessor32<attention_scores_scalar, 2, torch::RestrictPtrTraits> result, // [B, KV]
    const int64_t effective_kv_length,
    const dim3 blockDim,
    const int64_t rows_per_block,
    const int64_t kv_length,
    const int64_t batch_size
) {
    const auto row_id = threadIdx.x / effective_kv_length;
    const auto effective_kv_length_id = threadIdx.x % effective_kv_length;
    const auto kv_length_start = effective_kv_length_id * min_kv_length_shard_size_per_thread;
    auto kv_length_end_ = (effective_kv_length_id + 1) * min_kv_length_shard_size_per_thread;
    kv_length_end_ = (kv_length_end_ > kv_length) ? kv_length : kv_length_end_;
    const auto kv_length_end = kv_length_end_;

    const auto batch_id = blockIdx.x * rows_per_block + row_id;

    // We need 2 float storage for each row, one for max computation, the other for normalizing exponential
    extern __shared__ float temp_storage[];
    const auto row_id_mem_offset = row_id * 2;
    if (effective_kv_length_id == 0) {
        temp_storage[row_id_mem_offset] = -std::numeric_limits<float>::infinity();
        temp_storage[row_id_mem_offset + 1] = 0;
    }
    __syncthreads();

    // Compute mask and max
    if (batch_id < batch_size) {
        float thread_max = -std::numeric_limits<float>::infinity();
        for (int kv_length_id = kv_length_start; kv_length_id < kv_length_end; ++kv_length_id) {
            if (mask[batch_id][kv_length_id] == 0) {
                const float candidate = attention_scores[batch_id][kv_length_id];
                thread_max = (thread_max < candidate) ? candidate : thread_max;
            }
        }
        if (thread_max != -std::numeric_limits<float>::infinity()) {
            // TODO @thomasw21 with more memory we can probably compute a much faster `max-reduce` in parallel O(ln(n)) operations in each memory slot
            gpuAtomicMax(&temp_storage[row_id_mem_offset], thread_max);
        }
    }

    __syncthreads();

    // Compute exp(elt - max) masked
    float exponential[min_kv_length_shard_size_per_thread];
    if (batch_id < batch_size) {
        float thread_add = 0;
        for (int kv_length_id = kv_length_start; kv_length_id < kv_length_end; ++kv_length_id) {
            if (mask[batch_id][kv_length_id] == 0) {
                exponential[kv_length_id - kv_length_start] = std::exp(static_cast<float>(attention_scores[batch_id][kv_length_id]) - temp_storage[row_id_mem_offset]);
                thread_add = thread_add + exponential[kv_length_id - kv_length_start];
            } else {
                exponential[kv_length_id - kv_length_start] = 0.;
            }
        }
        if (thread_add > 0) {
            // TODO @thomasw21 with more memory we can probably compute a much faster `sum-reduce` in parallel O(ln(n)) operations in each memory slot
            gpuAtomicAdd(&temp_storage[row_id_mem_offset + 1], thread_add);
        }
    }

    __syncthreads();

    // Compute softmax
    if (batch_id < batch_size) {
        // If sum of all exponential is 0, we set the softmax values to 0
        if (temp_storage[row_id_mem_offset + 1] == 0.) {
            for (int kv_length_id = kv_length_start; kv_length_id < kv_length_end; ++kv_length_id) {
                result[batch_id][kv_length_id] = 0.;
            }
        } else {
            for (int kv_length_id = kv_length_start; kv_length_id < kv_length_end; ++kv_length_id) {
                result[batch_id][kv_length_id] = static_cast<attention_scores_scalar>(exponential[kv_length_id - kv_length_start] / temp_storage[row_id_mem_offset + 1]);
            }
        }
    }
}

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forward(
    const at::Tensor fused_qkv,
    const std::optional<std::vector<at::Tensor>> layer_past,
    const at::Tensor alibi,
    const at::Tensor attention_mask,
    const std::optional<at::Tensor> head_mask,
    const float beta,
    const float inv_norm_factor,
    const int num_heads,
    const bool use_cache
) {
    const auto batch_size = fused_qkv.size(0);
    const auto q_length = fused_qkv.size(1);
    const auto three_times_hidden_size = fused_qkv.size(2);
    const auto head_dim = three_times_hidden_size / (3 * num_heads);
    const auto batch_size_times_num_heads = batch_size * num_heads;

    // `split_heads`
    const auto fused_qkv_view = fused_qkv.view({batch_size, q_length, num_heads, 3 * head_dim});
    const auto tensor_list = fused_qkv_view.split(head_dim, -1);
    const auto query_layer = tensor_list[0].transpose(1, 2).reshape({batch_size_times_num_heads, q_length, head_dim});
    auto key_layer = tensor_list[1].permute({0, 2, 3, 1}).reshape({batch_size_times_num_heads, head_dim, q_length});
    auto value_layer = tensor_list[2].transpose(1, 2).reshape({batch_size_times_num_heads, q_length, head_dim});

    if (layer_past) {
        const auto past_key = (*layer_past).at(0);
        const auto past_value = (*layer_past).at(1);
        key_layer = at::cat({past_key, key_layer}, 2);
        value_layer = at::cat({past_value, value_layer}, 1);
    }

    std::optional<std::vector<at::Tensor>> present;
    if (use_cache) {
        present = {key_layer, value_layer};
    } else {
        present = {};
    }

    auto attention_scores = alibi.baddbmm(query_layer, key_layer, beta, inv_norm_factor);

    // Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_intial_dtype`
    at::Tensor attention_probs;
    if (true) {
        const auto kv_length = key_layer.size(2);

        // TODO @thomasw21: it's easier to think of attention_scores as 2D tensors
        const auto attention_scores_2d = attention_scores.view({batch_size_times_num_heads * q_length, kv_length});
        const auto attention_mask_2d = attention_mask.view({batch_size_times_num_heads * q_length, kv_length});

        // Custom kernel
        attention_probs = at::empty_like(attention_scores_2d);

        // Check that inputs and contiguous + cuda tensors
        CHECK_INPUT(attention_scores_2d);
        CHECK_INPUT(attention_mask_2d);

        // TODO @thomas21: change by to this as it's cleaner when pytorch 1.13 comes out
        // DISPATCH_CASE_FLOATING_TYPES(attention_scores.scalar_type(), "masked_softmax", [&] {
        AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, attention_scores.scalar_type(), "masked_softmax", [&] {
            /*
            * Understanding how GPUs work: https://developer.nvidia.com/blog/cuda-refresher-cuda-programming-model/
            * A100 specifications: https://images.nvidia.com/aem-dam/en-zz/Solutions/data-center/nvidia-ampere-architecture-whitepaper.pdf
            *  - SMs: 108
            *  - TPCs: 56 (What's that?)
            *  - Memory size: 40 GB
            *  - L2 Cache size: 40960 KB (shared across all SMs)
            *  - L1/Shared memory size: 192 KB (shared across all threads within a SM)
            *  - Max Threads / SM: 2048
            *  - Max Thread Blocks / SM: 32
            */

            /*
            * We should split [batch_size_times_num_heads_block, q_length] in seperate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
            * with multiple threads as we need to `sync_threads` to run exponential sum.
            * We maximise the usage of threads within a single block
            */
            // TODO @thomasw21 figure out everything warp related:
            //  - why do they have to be power of 2
            // TODO @thomas21 check why everyone is setting 1024 when officially it's 2048
            const auto MAX_THREADS_PER_SM = 1024;
            // TODO @thomasw21 figure out how to have longer sequences, currently the maximum is `max_kv_length = MAX_THREADS_PER_SM * MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD`
            const auto MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD = 4;
            // `effective_kv_length = ceil(kv_length / MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD)`
            const auto effective_kv_length = (kv_length - 1)/ MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD + 1;
            const auto rows_per_block = MAX_THREADS_PER_SM / effective_kv_length;
            const auto num_blocks = (batch_size_times_num_heads * q_length - 1) / rows_per_block + 1;

            const dim3 gridDim(num_blocks); // Number of blocks that run
            const dim3 blockDim(MAX_THREADS_PER_SM); // Number of threads that run per block
            const int shared_mem_forward = rows_per_block * 2 * sizeof(float);

            // 192 * 2 ** 10
            // const auto MAX_L1_MEMORY = 196608;
            // const auto MAX_SMs = 108;
            // TORCH_CHECK(batch_size_times_num_heads * q_length <= MAX_L1_MEMORY, "Shared memory exceeds 192KB limitation.");
            // TORCH_CHECK(gridDim.x * gridDim.y * gridDim.z <= MAX_SMs, "A100s only have 108 SMs. Raising as require blocks is bigger.");
            // TORCH_CHECK(blockDim.x * blockDim.y * blockDim.z <= MAX_THREADS_PER_SM, "A100s only have 2048 threads per block. Raising as require requested threads is higher.");

            forward_masked_softmax_kernel<scalar_t, MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD><<<gridDim, blockDim, shared_mem_forward>>>(
                attention_scores_2d.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
                attention_mask_2d.packed_accessor32<bool, 2, torch::RestrictPtrTraits>(),
                attention_probs.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
                effective_kv_length,
                blockDim,
                rows_per_block,
                kv_length,
                batch_size_times_num_heads * q_length
            );
        });
        attention_probs = attention_probs.view({batch_size_times_num_heads, q_length, kv_length});
    } else {
        // Pytorch C++ API
        auto input_dtype = attention_scores.scalar_type();
        if (input_dtype == at::ScalarType::Float) {
            attention_scores = attention_scores.to(at::ScalarType::Float);
        };
        // TODO @thomasw21 Figure out how to get minimum value
        auto attn_weights = attention_scores.masked_fill_(attention_mask, -1e34);
        attention_probs = attn_weights.softmax(-1, at::ScalarType::Float).to(input_dtype);
    }

    auto context_layer = attention_probs.bmm(value_layer);

    // `_merge_heads`
    context_layer = context_layer.view({batch_size, num_heads, q_length, head_dim});
    context_layer = context_layer.permute({0, 2, 1, 3});
    context_layer = context_layer.reshape({batch_size, q_length, three_times_hidden_size / 3});

    return std::make_tuple(context_layer, present, attention_probs);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "forward",
        &forward,
        "Bloom attention mechanism forward (CUDA)"
    );
}


================================================
FILE: server/custom_kernels/setup.py
================================================
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

extra_compile_args = ["-std=c++17"]

setup(
    name="custom_kernels",
    ext_modules=[
        CUDAExtension(
            name="custom_kernels.fused_bloom_attention_cuda",
            sources=["custom_kernels/fused_bloom_attention_cuda.cu"],
            extra_compile_args=extra_compile_args,
        ),
        CUDAExtension(
            name="custom_kernels.fused_attention_cuda",
            sources=["custom_kernels/fused_attention_cuda.cu"],
            extra_compile_args=extra_compile_args,
        ),
    ],
    cmdclass={"build_ext": BuildExtension},
)


================================================
FILE: server/exllama_kernels/exllama_kernels/cu_compat.cuh
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _cuda_compat_cuh
#define _cuda_compat_cuh

// atomicAdd for half types, to support CC < 7.x

__device__ __forceinline__ void atomicAdd_half(half* address, half val)
{
    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    do
    {
        assumed = old;
        __half_raw hsum;
        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
        half tmpres = __hadd(hsum, val);
        hsum = __half_raw(tmpres);
        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
        old = atomicCAS(address_as_ui, assumed, old);
    }
    while (assumed != old);
}

// atomicAdd for half2 types

__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
{
    unsigned int* address_as_ui = (unsigned int*)address;
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    do
    {
        assumed = old;
        half2 old_val = *((half2*)&old);
        half2 new_val = __hadd2(old_val, val);
        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
    }
    while (assumed != old);
}

//

#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)

__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }

#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
#endif

#endif
#endif

#endif


================================================
FILE: server/exllama_kernels/exllama_kernels/cuda_buffers.cu
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#define _cuda_buffers_cu
#include "cuda_buffers.cuh"

CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL};
// __constant__ half2 q4_table[16][256];
// half2 q4_table_host[16][256];
// bool q4_table_init = false;

CudaBuffers::CudaBuffers
(
    int _device,
    half* _temp_state,
    half* _temp_dq
) :
    device(_device),
    temp_state(_temp_state),
    temp_dq(_temp_dq)
{
    cudaSetDevice(_device);

    cudaStreamCreate(&alt_stream_1);
    cudaStreamCreate(&alt_stream_2);
    cudaStreamCreate(&alt_stream_3);
    cudaEventCreate(&alt_stream_1_done);
    cudaEventCreate(&alt_stream_2_done);
    cudaEventCreate(&alt_stream_3_done);
}

CudaBuffers::~CudaBuffers()
{
    cudaStreamDestroy(alt_stream_1);
    cudaStreamDestroy(alt_stream_2);
    cudaStreamDestroy(alt_stream_3);
    cudaEventDestroy(alt_stream_1_done);
    cudaEventDestroy(alt_stream_2_done);
    cudaEventDestroy(alt_stream_3_done);
}

CudaBuffers* get_buffers(const int device_index)
{
    return g_buffers[device_index];
}

void prepare_buffers_cuda
(
    int _device,
    half* _temp_state,
    half* _temp_dq
)
{
    CudaBuffers* buffers = new CudaBuffers
    (
        _device,
        _temp_state,
        _temp_dq
    );

    g_buffers[_device] = buffers;
}

void cleanup_buffers_cuda()
{
    for (int i = 0; i < CUDA_MAX_DEVICES; i++)
    {
        if (!g_buffers[i]) continue;
        delete g_buffers[i];
        g_buffers[i] = NULL;
    }
}


================================================
FILE: server/exllama_kernels/exllama_kernels/cuda_buffers.cuh
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _cuda_buffers_cuh
#define _cuda_buffers_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>

const int CUDA_MAX_DEVICES = 16;

// #ifndef _cuda_buffers_cu
// extern __constant__ half2 q4_table[16][256];
// #endif

class CudaBuffers
{
public:
    int device;

    half* temp_state;           // [max_hidden_rows * intermediate_size]
    half* temp_dq;              // size of largest quant tensor * 8

    cudaStream_t alt_stream_1;
    cudaStream_t alt_stream_2;
    cudaStream_t alt_stream_3;
    cudaEvent_t alt_stream_1_done;
    cudaEvent_t alt_stream_2_done;
    cudaEvent_t alt_stream_3_done;

    CudaBuffers
    (
        int _device,
        half* _temp_state,
        half* _temp_dq
    );
    ~CudaBuffers();
};

CudaBuffers* get_buffers(const int device_index);

void prepare_buffers_cuda
(
    int _device,
    half* _temp_state,
    half* _temp_dq
);

void cleanup_buffers_cuda();

#endif


================================================
FILE: server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#include "column_remap.cuh"
#include "../util.cuh"

const int SHUF_BLOCKSIZE_X = 256;
const int SHUF_BLOCKSIZE_Y = 16;

__global__ void column_remap_kernel
(
    const half* __restrict__ x,
    half* __restrict__ x_new,
    const int x_width,
    const int x_height,
    const uint32_t* x_map
)
{
    int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
    int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y;

    int x_stride = x_width;
    int x_idx = x_row * x_stride + x_column;

    int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height);
    int x_idx_end = x_row_end * x_stride + x_column;

    int s_column = x_map[x_column];
    int s_idx = x_row * x_stride + s_column;

    while (x_idx < x_idx_end)
    {
        x_new[x_idx] = x[s_idx];
        x_idx += x_stride;
        s_idx += x_stride;
    }
}

// Remap columns in x to correspond to sequential group index before matmul
//
// perform x -> seq_x such that seq_x @ seq_w == x @ w

void column_remap_cuda
(
    const half* x,
    half* x_new,
    const int x_height,
    const int x_width,
    const uint32_t* x_map
)
{
    dim3 threads(SHUF_BLOCKSIZE_X, 1, 1);

    dim3 blocks
    (
        (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X,
        (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y,
        1
    );

    column_remap_kernel<<<blocks, threads>>>(x, x_new, x_width, x_height, x_map);
}


================================================
FILE: server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _column_remap_cuh
#define _column_remap_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>

void column_remap_cuda
(
    const half* x,
    half* x_new,
    const int x_height,
    const int x_width,
    const uint32_t* x_map
);

#endif


================================================
FILE: server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu
================================================
#include "q4_matmul.cuh"
#include "column_remap.cuh"
#include <ATen/cuda/CUDAContext.h>
#include "../util.cuh"
#include "../matrix.cuh"
#include "../cu_compat.cuh"
#include "../cuda_buffers.cuh"
#if defined(USE_ROCM)
#include "../hip_compat.cuh"
#endif

const int THREADS_X = 32;       // Block size and thread count along columns in w and out
const int THREADS_Y = 1;        // Block size and thread count along rows in x and out

typedef void (*fp_q4_matmul_kernel)
(
    const half*,
    const uint32_t*,
    half*,
    const half*,
    const uint32_t*,
    const int,
    const int,
    const int,
    const int,
    const int,
    const uint32_t*,
    bool
);

template<bool use_half2, bool use_groupsize, bool use_x_map>
__global__ void q4_matmul_kernel
(
    const half* __restrict__ x,
    const uint32_t* __restrict__ w,
    half* __restrict__ out,
    const half* __restrict__ w_scales,
    const uint32_t* __restrict__ w_zeros,
    const int height,
    const int dim,
    const int width,
    const int groupsize,
    const int block_size_z,
    const uint32_t* __restrict__ x_map,
    bool no_zero
)
{
    // Start of block

    int x_column = block_size_z * blockIdx.z;
    int x_column_end = min(dim, block_size_z * (blockIdx.z + 1));

    int w_column = THREADS_X * blockIdx.x + threadIdx.x;
    int x_row = THREADS_Y * blockIdx.y + threadIdx.y;

    int iterations = (x_column_end - x_column) / 8;

    // Views

    MatrixView_half x_(x, height, dim);
    MatrixView_half w_scales_(w_scales, dim / groupsize, width);
    MatrixView_q4_row w_zeros_(w_zeros, dim / groupsize, width);
    MatrixView_q4_column w_(w, dim, width);
    MatrixView_half_rw out_(out, height, width);

    // Zero output

    if (!no_zero && blockIdx.z == 0 && (threadIdx.x & 1) == 0)
    {
        *((uint32_t*) out_.item_ptr(x_row, w_column)) = 0;
        __syncthreads();
    }

    // Loop over part of x row (and w column)

    half2 acc = {};
    half acc_h = {};

    if constexpr (use_groupsize)
    {
        // For quant matrices where groupsize divides BLOCK_SIZE_Z we always start on a group boundary, so this
        // could be slightly faster

        for (int k = x_column, group = x_column / groupsize; k < x_column + iterations * 8; group++, k += groupsize)
        {
            if constexpr (use_half2)
            {
                half2 w_scale = w_scales_.item_half2half2(group, w_column);
                uint32_t w_zero = (w_zeros_.item(group, w_column) + 1) & 0x0F;

                if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
                else                     acc = dot_product_8      (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
            }
            else
            {
                half w_scale = w_scales_.item(group, w_column);
                uint32_t w_zero = (w_zeros_.item(group, w_column) + 1) & 0x0F;

                if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
                else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
            }
        }
    }
    else
    {
        // Otherwise assume groupsize is a multiple of 8, do 8 columns per iteration and trust the cache

        for (int k = x_column; k < x_column + iterations * 8; k += 8)
        {
            if constexpr (use_half2)
            {
                int group = k / groupsize;
                half2 w_scale = w_scales_.item_half2half2(group, w_column);
                uint32_t w_zero = (w_zeros_.item(group, w_column) + 1) & 0x0F;

                if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
                else                     acc = dot_product_8      (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
            }
            else
            {
                int group = k / groupsize;
                half w_scale = w_scales_.item(group, w_column);
                uint32_t w_zero = (w_zeros_.item(group, w_column) + 1) & 0x0F;

                if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
                else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
            }
        }
    }

    // Add to block result

    if constexpr (use_half2)
    {
        half result = __hadd(__low2half(acc), __high2half(acc));
        atomicAdd(out_.item_ptr(x_row, w_column), result);
    }
    else
    {
        atomicAdd(out_.item_ptr(x_row, w_column), acc_h);
    }
}

fp_q4_matmul_kernel q4_matmul_kernel_pick(ExLlamaTuning* tuningParams, int block_size_z, int groupsize, uint32_t* x_map)
{
    // <bool use_half2, bool use_groupsize, bool use_x_map>
    if (tuningParams->matmul_no_half2) {
        if (block_size_z % groupsize == 0) {
            if (x_map) return q4_matmul_kernel<false, true,  true >;
            else       return q4_matmul_kernel<false, true,  false>;
        } else {
            if (x_map) return q4_matmul_kernel<false, false, true >;
            else       return q4_matmul_kernel<false, false, false>;
        }
    } else {
        if (block_size_z % groupsize == 0)
        {
            if (x_map) return q4_matmul_kernel<true,  true,  true >;
            else       return q4_matmul_kernel<true,  true,  false>;
        } else {
            if (x_map) return q4_matmul_kernel<true,  false, true >;
            else       return q4_matmul_kernel<true,  false, false>;
        }
    }
};

// Compute y = x @ w

void q4_matmul_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    const Q4Matrix* w,
    half* out,
    bool no_zero,
    cudaStream_t alt_stream
)
{
    int height = x_height;
    int dim = w->height;
    int width = w->width;

    cudaSetDevice(w->device);

    uint32_t* x_map = w->cuda_x_map;
    const half* x_mapped = x;
    if (x_map && !tuningParams->matmul_fused_remap && !alt_stream)
    {
        CudaBuffers* buffers = get_buffers(w->device);
        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
        x_mapped = buffers->temp_state;
        x_map = NULL;
    }

    int block_size_z;
    if (w->width == 4096) block_size_z = 384;           // 7B
    else if (w->width == 11008) block_size_z = 256;
    else if (w->width == 5120) block_size_z = 384;      // 13B
    else if (w->width == 13824) block_size_z = 256;
    else if (w->width == 6656) block_size_z = 256;      // 33B
    else if (w->width == 17920) block_size_z = 128;
    else block_size_z = 256;

    //if (!no_zero) cudaMemsetAsync(out, 0, x_height * w->width * sizeof(half));

    dim3 threads(THREADS_X, THREADS_Y, 1);

    dim3 blocks
    (
        (width + threads.x - 1) / threads.x,
        (height + threads.y - 1) / threads.y,
        (dim + block_size_z - 1) / block_size_z
    );

    fp_q4_matmul_kernel kernel = q4_matmul_kernel_pick(tuningParams, block_size_z, w->groupsize, x_map);

    kernel<<<blocks, threads, 0, alt_stream>>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero);
}

void q4_matmul_recons_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    Q4Matrix* w,
    half* out,
    bool no_zero,
    const cublasHandle_t handle
)
{
    int height = x_height;
    int dim = w->height;
    int width = w->width;

    cudaSetDevice(w->device);
    CudaBuffers* buffers = get_buffers(w->device);

    const half* x_mapped = x;
    if (w->cuda_x_map)
    {
        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
        x_mapped = buffers->temp_state;
    }

    w->reconstruct(buffers->temp_dq);

    const half alpha = __float2half(1.0f);
    const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);

//     const float alpha = 1.0f;
//     const float beta = no_zero ? 1.0f : 0.0f;
//     cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
//                 x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
}


================================================
FILE: server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _q4_matmul_cuh
#define _q4_matmul_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/cuda/CUDAContext.h>

#include "q4_matrix.cuh"
#include "../tuning.h"

void q4_matmul_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    const Q4Matrix* w,
    half* out,
    bool no_zero,
    cudaStream_t alt_stream
);

void q4_matmul_recons_cuda
(
    ExLlamaTuning* tuningParams,
    const half* x,
    const int x_height,
    Q4Matrix* w,
    half* out,
    bool no_zero,
    const cublasHandle_t handle
);

#endif


================================================
FILE: server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#include <ATen/cuda/CUDAContext.h>
#include "q4_matrix.cuh"
#include <vector>
#include "../util.cuh"
#include "../matrix.cuh"

using namespace std;

const int UNSHUF_BLOCKSIZE_X = 64;

const int RECONS_THREADS_X = 64;      // Block size and thread count along columns in out, each thread converts 1 column
const int RECONS_THREADS_Y = 1;       // Block size and thread count along rows in x and out, each thread converts 8 rows

vector<Q4Matrix*> g_q4_matrices;

void g_q4_keep_matrix(Q4Matrix* m)
{
    g_q4_matrices.push_back(m);
}

void g_q4_free_matrices()
{
    for (const auto& m : g_q4_matrices) delete m;
    g_q4_matrices.clear();
}

Q4Matrix::Q4Matrix
(
    const int _height,
    const int _width,
    const int _groups,

    uint32_t* _qweight,
    uint32_t* _qzeros,
    half* _scales,
    uint32_t* _g_idx,

    const int _device
) :
    height(_height),
    width(_width),
    groups(_groups),
    device(_device)
{
    cudaSetDevice(device);

    cuda_qweight = _qweight;
    cuda_qzeros = _qzeros;
    cuda_scales = _scales;

    groupsize = height / groups;

    if (_g_idx) make_sequential(_g_idx);
}

Q4Matrix::~Q4Matrix()
{
}

// Make sequential

__global__ void make_sequential_kernel
(
    const uint32_t* __restrict__ w,
    uint32_t* __restrict__ w_new,
    const uint32_t* __restrict__ x_map,
    const int w_height,
    const int w_width
)
{
    const uint64_t* w2 = (uint64_t*) w;
    uint64_t* w_new2 = (uint64_t*) w_new;
    int w2_stride = w_width >> 1;

    int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
    int w_new2_row = blockIdx.y;

    int x_map_idx = w_new2_row << 3;

    uint64_t dst = 0;

    #pragma unroll
    for (int i = 0; i < 8; i++)
    {
        int source_row = x_map[x_map_idx++];

        int w2_row = source_row >> 3;
        int w2_subrow = source_row & 0x07;
        int w2_row_shift = w2_subrow << 2;
        int wnew2_row_shift = i << 2;

    uint64_t src = w2[w2_row * w2_stride + w2_column];
        src >>= w2_row_shift;
        src &= 0x0000000f0000000f;
        src <<= wnew2_row_shift;
        dst |= src;
    }

    w_new2[w_new2_row * w2_stride + w2_column] = dst;
}

void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
{
    uint32_t* cuda_new_qweight = NULL;
    cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
    cudaMalloc(&cuda_x_map, height * sizeof(uint32_t));  // TODO: Should probably be allocated in PyTorch

    uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
    uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
    uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));

    // Group histogram

    for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;

    // Group map

    for (int i = 0, acc = 0; i < groups; i++)
    {
        short tmp = cpu_g_idx_map[i];
        cpu_g_idx_map[i] = acc;
        acc += tmp;
    }

    // X map (inverse)

    for (int row = 0; row < height; row++)
    {
        uint32_t target_group = cpu_g_idx[row];
        uint32_t target_row = cpu_g_idx_map[target_group];
        cpu_g_idx_map[target_group]++;
        cpu_x_map_inv[row] = target_row;
    }

    // X map

    for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;

    // Move to CUDA

    cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);

    // Rearrange rows in w

    dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
    dim3 blocks(width / UNSHUF_BLOCKSIZE_X / 2, height / 8, 1);

    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    make_sequential_kernel<<<blocks, threads, 0, stream>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);

    // Replace qweights

    cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);

    // Cleanup

    cudaDeviceSynchronize();
    cudaFree(cuda_new_qweight);
    free(cpu_g_idx_map);
    free(cpu_x_map);
    free(cpu_x_map_inv);
}

__global__ void reconstruct_kernel
(
    const uint32_t* __restrict__ w,
    half* __restrict__ out,  // (y)
    const half* __restrict__ w_scales,
    const uint32_t* __restrict__ w_zeros,
    const int height,
    const int width,
    const int groupsize
)
{
    // Start of block

    int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
    int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;

    // Views

    MatrixView_q4_column w_(w, height, width);
    MatrixView_half_rw out_(out, height, width);
    MatrixView_half w_scales_(w_scales, height / groupsize, width);
    MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);

    // Groupsize version

    int group = row / groupsize;

    half w_scale = w_scales_.item(group, column);
    uint32_t w_zero = (w_zeros_.item(group, column) + 1) & 0x0F;

    uint32_t w_read = w_.item_uint32_t(row, column);
    half* out_ptr = out_.item_ptr(row, column);

    #pragma unroll
    for (int s = 0; s < 32; s += 4)
    {
        half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
        *out_ptr = w_item; out_ptr += out_.width;
    }
}

void Q4Matrix::reconstruct(half* out)
{
    dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);

    dim3 blocks
    (
        (width + threads.x - 1) / threads.x,
        (height / 8 + threads.y - 1) / threads.y,
        1
    );

    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    reconstruct_kernel<<<blocks, threads, 0, stream>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
}


================================================
FILE: server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _q4_matrix_cuh
#define _q4_matrix_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>

class Q4Matrix
{
public:

    int device;

    int height;
    int width;
    int groups;
    int groupsize;

    uint32_t* cuda_qweight = NULL;
    uint32_t* cuda_qzeros = NULL;
    half* cuda_scales = NULL;
    uint32_t* cuda_x_map = NULL;

    Q4Matrix
    (
        const int _height,
        const int _width,
        const int _groups,

        uint32_t* _qweight,
        uint32_t* _qzeros,
        half* _scales,
        uint32_t* _g_idx,

        const int _device
    );

    ~Q4Matrix();

    void reconstruct(half* out);

private:

    void make_sequential(const uint32_t* cpu_g_idx);

};

void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();

#endif


================================================
FILE: server/exllama_kernels/exllama_kernels/exllama_ext.cpp
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#include "util.cuh"
#include "tuning.h"
#include "cuda_buffers.cuh"
#include "cuda_func/q4_matrix.cuh"
#include "cuda_func/q4_matmul.cuh"
#include "cuda_func/column_remap.cuh"

// Check CUDA return code. We don't want to include Torch headers in the .cu files because parsing them adds almost a
// minute to the compile time on a 12900K. Also passing exceptions back to Python is super tricky, so in place of
// exceptions, CUDA functions return with a cudaError_t which we can parse and dump to the console.

void check_cuda(cudaError_t ret)
{
    switch (ret)
    {
        case cudaSuccess:
            break;

        case cudaUnspecified:
            printf(" **** Unspecified error\n");
            TORCH_CHECK(false, "CUDA error");
            break;

        default:
            printf(" **** CUDA error\n"); \
            printf(" **** %s\n", cudaGetErrorString(ret)); \
            TORCH_CHECK(false, "CUDA error"); \
            break;
    }
}

// Some decluttering macros

#define STRINGIFY_(__x) #__x
#define STRINGIFY(__x) STRINGIFY_(__x)
#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
#define TORCH_CHECK_SHAPE_MOD(__x, __dim_x, __mod) TORCH_CHECK((__x).size(__dim_x) % __mod == 0, #__x ".shape[" STRINGIFY(__dim_x) "] must be a multiple of " STRINGIFY(__mod))

#define TORCH_CHECK_DEVICE_INDEX(__index) \
do { \
    TORCH_CHECK(__index >= 0, "no device index"); \
    TORCH_CHECK(__index < CUDA_MAX_DEVICES, "invalid device index"); \
} while(0)

#define TORCH_CHECK_QUANT(__w, __w_scales, __w_zeros, __seq_g_idx, __x_map) \
do { \
    TORCH_CHECK_DTYPE(__w, kInt); \
    TORCH_CHECK_DTYPE(__w_scales, kHalf); \
    TORCH_CHECK_DTYPE(__w_zeros, kInt); \
    TORCH_CHECK_DTYPE_OPT(__seq_g_idx, kShort); \
    TORCH_CHECK_DTYPE_OPT(__x_map, kInt); \
    TORCH_CHECK_SHAPES_OPT(__seq_g_idx, 0, __w, 0, 2 * 8); \
    TORCH_CHECK_SHAPES_OPT(__x_map, 0, __w, 0, 8); \
} while(0)

int get_groupsize(torch::Tensor w, torch::Tensor w_zeros)
{
    int groupsize = w.size(0) * 8 / w_zeros.size(0);
    TORCH_CHECK(groupsize * w_zeros.size(0) == w.size(0) * 8, "w.shape[-2] must be a multiple of zeros.shape[-2]")
    return groupsize;
}


// Tuning parameters

ExLlamaTuning tuningParams;

void set_tuning_params
(
    int matmul_recons_thd,
    bool matmul_fused_remap,
    bool matmul_no_half2
)
{
    tuningParams.matmul_recons_thd = matmul_recons_thd;
    tuningParams.matmul_fused_remap = matmul_fused_remap;
    tuningParams.matmul_no_half2 = matmul_no_half2;
}


// Release all unmanaged objects allocated by the extension

void cleanup()
{
    cleanup_buffers_cuda();
    g_q4_free_matrices();
}


// Prepare buffers for forward pass

void prepare_buffers
(
    torch::Device device,
    torch::Tensor temp_state,
    torch::Tensor temp_dq
)
{
    int device_index = device.index();
    TORCH_CHECK_DEVICE_INDEX(device_index);
    const at::cuda::OptionalCUDAGuard device_guard(device);

    prepare_buffers_cuda
    (
        device_index,
        (half*) temp_state.data_ptr(),
        (half*) temp_dq.data_ptr()
    );
}


// Create Q4Matrix, return handle

uintptr_t make_q4
(
    torch::Tensor qweight,
    torch::Tensor qzeros,
    torch::Tensor scales,
    torch::Tensor g_idx,
    int device
)
{
    TORCH_CHECK_DTYPE(qweight, kInt);
    TORCH_CHECK_DTYPE(qzeros, kInt);
    TORCH_CHECK_DTYPE(scales, kHalf);
    TORCH_CHECK_DTYPE_OPT(g_idx, kInt);
    TORCH_CHECK_SHAPES(qweight, 1, qzeros, 1, 8);
    TORCH_CHECK_SHAPES(scales, 1, qweight, 1, 1);
    TORCH_CHECK_SHAPES(qzeros, 0, scales, 0, 1);

    int width = qweight.size(1);
    int height = qweight.size(0) * 8;
    int groups = qzeros.size(0);

    Q4Matrix* m = new Q4Matrix
    (
        height,
        width,
        groups,

        (uint32_t*) qweight.data_ptr(),
        (uint32_t*) qzeros.data_ptr(),
        (half*) scales.data_ptr(),
        g_idx.device().is_meta() ? NULL : (uint32_t*) g_idx.data_ptr(),

        device
    );

    g_q4_keep_matrix(m);
    return reinterpret_cast<uintptr_t> (m);
}


// Matmul half @ quant -> half

void q4_matmul
(
    torch::Tensor x,
    uintptr_t w,
    torch::Tensor out
)
{
    Q4Matrix* wm = reinterpret_cast<Q4Matrix*> (w);

    TORCH_CHECK_DTYPE(x, kHalf);
    TORCH_CHECK_DTYPE(out, kHalf);
    TORCH_CHECK_SHAPES(x, 0, out, 0, 1);
    TORCH_CHECK(wm->height == x.size(-1), "x and w have incompatible shapes")

    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));

    int x_height = x.size(0);

    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    if (tuningParams.matmul_recons_thd == 0 || x_height < tuningParams.matmul_recons_thd)
    {
        q4_matmul_cuda
        (
            &tuningParams,
            (half*) x.data_ptr(),
            x_height,
            wm,
            (half*) out.data_ptr(),
            false,
            stream
        );
    }
    else
    {
        q4_matmul_recons_cuda
        (
            &tuningParams,
            (half*) x.data_ptr(),
            x_height,
            wm,
            (half*) out.data_ptr(),
            false,
            at::cuda::getCurrentCUDABlasHandle()
        );
    }
}


// Remap columns in half tensor

void column_remap
(
    torch::Tensor x,
    torch::Tensor x_new,
    torch::Tensor x_map
)
{
    TORCH_CHECK_DTYPE(x, kHalf);
    TORCH_CHECK_DTYPE(x_new, kHalf);
    TORCH_CHECK_DTYPE(x_map, kInt);
    TORCH_CHECK_SHAPES(x_map, 0, x, 1, 1);

    int height = x.size(0);
    int width = x.size(1);

    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));

    column_remap_cuda
    (
        (half*) x.data_ptr(),
        (half*) x_new.data_ptr(),
        height,
        width,
        (uint32_t*) x_map.data_ptr()
    );
}


PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("set_tuning_params", &set_tuning_params, "set_tuning_params");
    m.def("prepare_buffers", &prepare_buffers, "prepare_buffers");
    m.def("cleanup", &cleanup, "cleanup");
    m.def("make_q4", &make_q4, "make_q4");
    m.def("q4_matmul", &q4_matmul, "q4_matmul");
}


================================================
FILE: server/exllama_kernels/exllama_kernels/hip_compat.cuh
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _hip_compat_cuh
#define _hip_compat_cuh

// Workaround for a bug in hipamd, backported from upstream, this is fixed in ROCm 5.6.
__device__ __forceinline__ __half __compat_hrcp(__half x) {
    return __half_raw{
        static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
}

__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
    return _Float16_2{
        _Float16_2{static_cast<_Float16>(1.0f),
            static_cast<_Float16>(1.0f)} / x.data};
}

#define hrcp __compat_hrcp
#define h2rcp __compat_h2rcp

// Automatic conversion of hipblasHgemm doesn't convert half to hipblasHalf.
__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t    handle,
                                                               hipblasOperation_t transA,
                                                               hipblasOperation_t transB,
                                                               int                m,
                                                               int                n,
                                                               int                k,
                                                               const half*        alpha,
                                                               const half*        AP,
                                                               int                lda,
                                                               const half*        BP,
                                                               int                ldb,
                                                               const half*        beta,
                                                               half*              CP,
                                                               int                ldc) {
    return hipblasHgemm(handle, transA, transB, m, n, k,
                        reinterpret_cast<const hipblasHalf *>(alpha),
                        reinterpret_cast<const hipblasHalf *>(AP), lda,
                        reinterpret_cast<const hipblasHalf *>(BP), ldb,
                        reinterpret_cast<const hipblasHalf *>(beta),
                        reinterpret_cast<hipblasHalf *>(CP), ldc);
}
#define hipblasHgemm __compat_hipblasHgemm

// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS.
#define rocblas_handle hipblasHandle_t
#define rocblas_operation_none HIPBLAS_OP_N
#define rocblas_get_stream hipblasGetStream
#define rocblas_set_stream hipblasSetStream
#define rocblas_hgemm __compat_hipblasHgemm

#endif


================================================
FILE: server/exllama_kernels/exllama_kernels/matrix.cuh
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _matrix_cuh
#define _matrix_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>

class MatrixView_half
{
public:
    const half* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }
};

class MatrixView_half_rw
{
public:
    half* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
    __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; }
    __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }
};

class MatrixView_q4_row
{
public:
    const uint32_t* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ int item(int row, int column) const
    {
        int shift = (column & 0x07) * 4;
        return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
    }
};

class MatrixView_q4_column
{
public:
    const uint32_t* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ int item(int row, int column) const
    {
        int shift = (row & 0x07) * 4;
        return (data[row / 8 * width + column] >> shift) & 0x0f;
    }

    __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; }
    __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; }
};

// TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu

// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale

__device__ __forceinline__ half2 dot_product_8
(
    const half2 acc,
    MatrixView_half& h_,
    const int h_row,
    const int h_column,                 // divisible by 8
    MatrixView_q4_column& v_,
    const int v_row,                    // divisible by 8
    const int v_column,
    const half2 v_scale_2,
    const uint32_t v_zero,              // + 1 (!!)
    const int count
)
{
    const half2* h_ptr = (const half2*) h_.item_ptr(h_row, h_column);
    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
    half2 result = acc;

    for (int i = 0; i < count; i++)
    {
        uint32_t v_read = *v_ptr; v_ptr += v_.width;

        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);

        half2 v_01 = __halves2half2(v_0, v_1);
        half2 v_23 = __halves2half2(v_2, v_3);
        half2 v_45 = __halves2half2(v_4, v_5);
        half2 v_67 = __halves2half2(v_6, v_7);

//         half2 v_01 = q4_table[v_zero - 1][(v_read      ) & 0xff]; // (constant memory is too slow apparently)
//         half2 v_23 = q4_table[v_zero - 1][(v_read >>  8) & 0xff];
//         half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff];
//         half2 v_67 = q4_table[v_zero - 1][(v_read >> 24)       ];

        half2 tmp = __hmul2(*h_ptr++, v_01);
        tmp = __hfma2(*h_ptr++, v_23, tmp);
        tmp = __hfma2(*h_ptr++, v_45, tmp);
        tmp = __hfma2(*h_ptr++, v_67, tmp);
        result = __hfma2(v_scale_2, tmp, result);
    }

    return result;
}

__device__ __forceinline__ half dot_product_8_h
(
    const half acc,
    MatrixView_half& h_,
    const int h_row,
    const int h_column,                 // divisible by 8
    MatrixView_q4_column& v_,
    const int v_row,                    // divisible by 8
    const int v_column,
    const half v_scale,
    const uint32_t v_zero,              // + 1 (!!)
    const int count
)
{
    const half* h_ptr = h_.item_ptr(h_row, h_column);
    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
    half result = acc;

    for (int i = 0; i < count; i++)
    {
        uint32_t v_read = *v_ptr; v_ptr += v_.width;

        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);

        half tmp = __hmul(*h_ptr++, v_0);
        tmp = __hfma(*h_ptr++, v_1, tmp);
        tmp = __hfma(*h_ptr++, v_2, tmp);
        tmp = __hfma(*h_ptr++, v_3, tmp);
        tmp = __hfma(*h_ptr++, v_4, tmp);
        tmp = __hfma(*h_ptr++, v_5, tmp);
        tmp = __hfma(*h_ptr++, v_6, tmp);
        tmp = __hfma(*h_ptr++, v_7, tmp);
        result = __hfma(v_scale, tmp, result);
    }

    return result;
}

// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map

__device__ __forceinline__ half2 dot_product_8_x_map
(
    const half2 acc,
    MatrixView_half& h_,
    const int h_row,
    const int h_column,                 // divisible by 8
    MatrixView_q4_column& v_,
    const int v_row,                    // divisible by 8
    const int v_column,
    const half2 v_scale_2,
    const uint32_t v_zero,              // + 1 (!!)
    const int count,
    const uint32_t* x_map
)
{
    const half* h_ptr = h_.item_ptr(h_row, 0);
    const uint32_t* x_map_ptr = x_map + h_column;
    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
    half2 result = acc;

    for (int i = 0; i < count; i++)
    {
        uint32_t v_read = *v_ptr; v_ptr += v_.width;

        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);

        half2 v_01 = __halves2half2(v_0, v_1);
        half2 v_23 = __halves2half2(v_2, v_3);
        half2 v_45 = __halves2half2(v_4, v_5);
        half2 v_67 = __halves2half2(v_6, v_7);

        half h_0 = h_ptr[*x_map_ptr++];
        half h_1 = h_ptr[*x_map_ptr++];
        half h_2 = h_ptr[*x_map_ptr++];
        half h_3 = h_ptr[*x_map_ptr++];
        half h_4 = h_ptr[*x_map_ptr++];
        half h_5 = h_ptr[*x_map_ptr++];
        half h_6 = h_ptr[*x_map_ptr++];
        half h_7 = h_ptr[*x_map_ptr++];

        half2 h_01 = __halves2half2(h_0, h_1);
        half2 h_23 = __halves2half2(h_2, h_3);
        half2 h_45 = __halves2half2(h_4, h_5);
        half2 h_67 = __halves2half2(h_6, h_7);

        half2 tmp = __hmul2(h_01, v_01);
        tmp = __hfma2(h_23, v_23, tmp);
        tmp = __hfma2(h_45, v_45, tmp);
        tmp = __hfma2(h_67, v_67, tmp);
        result = __hfma2(v_scale_2, tmp, result);
    }

    return result;
}

__device__ __forceinline__ half dot_product_8_x_map_h
(
    const half acc,
    MatrixView_half& h_,
    const int h_row,
    const int h_column,                 // divisible by 8
    MatrixView_q4_column& v_,
    const int v_row,                    // divisible by 8
    const int v_column,
    const half v_scale,
    const uint32_t v_zero,              // + 1 (!!)
    const int count,
    const uint32_t* x_map
)
{
    const half* h_ptr = h_.item_ptr(h_row, 0);
    const uint32_t* x_map_ptr = x_map + h_column;
    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
    half result = acc;

    for (int i = 0; i < count; i++)
    {
        uint32_t v_read = *v_ptr; v_ptr += v_.width;

        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);

        half tmp = __hmul(h_ptr[*x_map_ptr++], v_0);
        tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp);
        tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp);
        tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp);
        tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp);
        tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp);
        tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp);
        tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp);
        result = __hfma(v_scale, tmp, result);
    }

    return result;
}

#endif


================================================
FILE: server/exllama_kernels/exllama_kernels/tuning.h
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _tuning_h
#define _tuning_h

struct ExLlamaTuning
{
    int matmul_recons_thd;
    bool matmul_fused_remap;
    bool matmul_no_half2;
};

#endif


================================================
FILE: server/exllama_kernels/exllama_kernels/util.cuh
================================================
// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#ifndef _util_cuh
#define _util_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>

#if defined(USE_ROCM)
#define cudaUnspecified hipErrorUnknown
#else
#define cudaUnspecified cudaErrorApiFailureBase
#endif

// React to failure on return code != cudaSuccess

#define _cuda_check(fn) \
do { \
    {_cuda_err = fn;} \
    if (_cuda_err != cudaSuccess) goto _cuda_fail; \
} while(false)

// React to failure on return code == 0

#define _alloc_check(fn) \
do { \
    if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \
    else _cuda_err = cudaSuccess; \
} while(false)

#endif


================================================
FILE: server/exllama_kernels/setup.py
================================================
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
import torch

extra_cuda_cflags = []
extra_cflags = []
if torch.version.hip:
    extra_cflags = ["-DLEGACY_HIPBLAS_DIRECT=ON"]
    extra_cuda_cflags = ["-DLEGACY_HIPBLAS_DIRECT=ON"]

extra_compile_args = {
    "cxx": extra_cflags,
    "nvcc": extra_cuda_cflags,
}

setup(
    name="exllama_kernels",
    ext_modules=[
        CUDAExtension(
            name="exllama_kernels",
            sources=[
                "exllama_kernels/exllama_ext.cpp",
                "exllama_kernels/cuda_buffers.cu",
                "exllama_kernels/cuda_func/column_remap.cu",
                "exllama_kernels/cuda_func/q4_matmul.cu",
                "exllama_kernels/cuda_func/q4_matrix.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/config.h
================================================
#ifndef _config_h
#define _config_h

#define MAX_Q_GEMM_ROWS 50
#define MAX_Q_GEMM_WEIGHTS 4  // must be <= MAX_Q_GEMM_ROWS

#define QMODE_2BIT 1
#define QMODE_3BIT 1
#define QMODE_4BIT 1
#define QMODE_5BIT 1
#define QMODE_6BIT 0
#define QMODE_8BIT 0


#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cpp/util.h
================================================
#ifndef _util_h
#define _util_h

#define DBGS(__x) printf("%s\n", __x)
#define DBGI(__x) printf("%s: %i\n", #__x, __x)
#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y)
#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z)
#define DBGF(__x) printf("%s: %f\n", #__x, __x)
#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y)
#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z)

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh
================================================
#ifndef _compat_cuh
#define _compat_cuh

// atomicAdd for half types, to support CC < 7.x

__device__ __forceinline__ void atomicAdd_half(half* address, half val)
{
    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    do
    {
        assumed = old;
        __half_raw hsum;
        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
        half tmpres = __hadd(hsum, val);
        hsum = __half_raw(tmpres);
        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
        old = atomicCAS(address_as_ui, assumed, old);
    }
    while (assumed != old);
}

// atomicAdd for half2 types

__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
{
    unsigned int* address_as_ui = (unsigned int*)address;
    unsigned int old = *address_as_ui;
    unsigned int assumed;
    do
    {
        assumed = old;
        half2 old_val = *((half2*)&old);
        half2 new_val = __hadd2(old_val, val);
        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
    }
    while (assumed != old);
}

//

#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)

__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }

#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
#endif

#endif
#endif

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh
================================================
#ifndef _matrix_view_cuh
#define _matrix_view_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "quant/qdq_util.cuh"

class MatrixView_half
{
public:
    const half* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }

    __device__ __forceinline__ void item4(half (&items)[4], int row, int column) const
    {
        half2* ptr = (half2*) item_ptr(row, column);
        half2 i01 = ptr[0];
        half2 i23 = ptr[1];
        items[0] = __low2half(i01);
        items[1] = __high2half(i01);
        items[2] = __low2half(i23);
        items[3] = __high2half(i23);
    }
    __device__ __forceinline__ void item4_f(float (&items)[4], int row, int column) const
    {
        half2* ptr = (half2*)item_ptr(row, column);
        half2 i01 = ptr[0];
        half2 i23 = ptr[1];
        items[0] = __half2float(__low2half(i01));
        items[1] = __half2float(__high2half(i01));
        items[2] = __half2float(__low2half(i23));
        items[3] = __half2float(__high2half(i23));
    }

    __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, int column) const
    {
        half2* ptr = (half2*)item_ptr(row, column);
        half2 i01 = ptr[0];
        half2 i23 = ptr[1];
        items[0] = __half2half2(__low2half(i01));
        items[1] = __half2half2(__high2half(i01));
        items[2] = __half2half2(__low2half(i23));
        items[3] = __half2half2(__high2half(i23));
    }
};

class MatrixView_half_rw
{
public:
    half* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
    __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; }
    __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }

    __device__ __forceinline__ void set4(int row, int column, half v0, half v1, half v2, half v3)
    {
        half2 v01 = __halves2half2(v0, v1);
        half2 v23 = __halves2half2(v2, v3);
        half2* ptr = (half2*) item_ptr(row, column);
        ptr[0] = v01;
        ptr[1] = v23;
    }
};

class MatrixView_q4_row
{
public:
    const uint32_t* data;
    const int height;
    const int width;

    __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
        : data(data), height(height), width(width)
    { }

    __device__ __forceinline__ int item(int row, int column) const
    {
        int shift = (column & 0x07) * 4;
        return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
    }

    __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const
    {
        int shift = (column & 0x07) * 4;
        uint32_t d = data[row * width / 8 + column / 8] >> shift;
        items[0] = d & 0x0f;
        items[1] = (d >> 4) & 0x0f;
    }

    __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const
    {
        int shift = (column & 0x07) * 4;
        uint32_t d = data[row * width / 8 + column / 8] >> shift;
        items[0] = d & 0x0f;
        items[1] = (d >> 4) & 0x0f;
        items[2] = (d >> 8) & 0x0f;
        items[3] = (d >> 12) & 0x0f;
    }
};

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
================================================
#include "q_gemm.cuh"
#include "util.cuh"
#include "matrix_view.cuh"
#include "../config.h"

#include "quant/qdq_2.cuh"
#include "quant/qdq_3.cuh"
#include "quant/qdq_4.cuh"
#include "quant/qdq_5.cuh"
#include "quant/qdq_6.cuh"
#include "quant/qdq_8.cuh"

#define GPTQ_BLOCK_KN_SIZE 128
#define GPTQ_BLOCK_M_SIZE_MAX 8
#define GPTQ_MAX_GROUPS_IN_BLOCK (GPTQ_BLOCK_KN_SIZE / 32)

#define EXL2_BLOCK_KN_SIZE 64
#define EXL2_BLOCK_M_SIZE_MAX 8
#define EXL2_MAX_GROUPS_IN_BLOCK (EXL2_BLOCK_KN_SIZE / 32)

#define CLEAR_N_SIZE 256

#include "q_gemm_kernel.cuh"
#include "q_gemm_kernel_gptq.cuh"

void gemm_half_q_half_cuda_part
(
    const half* a,
    QMatrix* b,
    half* c,
    int size_m,
    int size_n,
    int size_k,
    int m_count,
    bool clear,
    const half* r_weights,
    int r_weights_stride,
    bool mul_r_weights
)
{
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    if (!b->is_gptq)
    {
        dim3 blockDim, gridDim;
        blockDim.x = EXL2_BLOCK_KN_SIZE;
        blockDim.y = 1;
        blockDim.z = 1;
        gridDim.x = DIVIDE(size_n, EXL2_BLOCK_KN_SIZE * 4);
        gridDim.y = DIVIDE(size_m, m_count);
        gridDim.z = DIVIDE(size_k, EXL2_BLOCK_KN_SIZE);

        fp_gemm_half_q_half_kernel kernel = pick_gemm_half_q_half_kernel(m_count, r_weights != NULL, mul_r_weights);

        kernel<<<gridDim, blockDim, 0, stream>>>
        (
            a,
            b->cuda_q_weight,
            b->cuda_q_scale,
            b->cuda_q_scale_max,
            c,
            size_m,
            size_n,
            size_k,
            b->groups,
            b->cuda_q_group_map,
            b->cuda_q_perm,
            b->rows_8,
            b->rows_6,
            b->rows_5,
            b->rows_4,
            b->rows_3,
            b->rows_2,
            clear,
            r_weights,
            r_weights_stride
        );
    }
    else
    {
        dim3 blockDim, gridDim;
        blockDim.x = GPTQ_BLOCK_KN_SIZE;
        blockDim.y = 1;
        blockDim.z = 1;
        gridDim.x = DIVIDE(size_n, GPTQ_BLOCK_KN_SIZE * 4);
        gridDim.y = DIVIDE(size_m, m_count);
        gridDim.z = DIVIDE(size_k, GPTQ_BLOCK_KN_SIZE);

        fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(m_count, r_weights != NULL, mul_r_weights);

//         DBGX((uint64_t) r_weights);
//         if (r_weights)
//             print_global_mem(r_weights, 1, 1, 1);
//         DBGI(r_weights_stride);

        kernel<<<gridDim, blockDim, 0, stream>>>
        (
            a,
            b->cuda_q_weight,
            b->cuda_gptq_qzeros,
            b->cuda_gptq_scales,
            c,
            size_m,
            size_n,
            size_k,
            b->groups,
            b->gptq_groupsize,
            b->cuda_q_perm,
            b->rows_4,
            clear,
            r_weights,
            r_weights_stride
        );
    }
}

void gemm_half_q_half_cuda
(
    cublasHandle_t cublas_handle,
    const half* a,
    QMatrix* b,
    half* c,
    int size_m,
    int size_n,
    int size_k,
    bool clear,
    half* temp_dq,
    bool force_cuda,
    const half* r_weights,
    const int r_weights_stride,
    bool mul_r_weights
)
{
    if (size_m > MAX_Q_GEMM_ROWS && !force_cuda)
    {
        // Reconstruct FP16 matrix, then cuBLAS

        if (!temp_dq) temp_dq = b->temp_dq;
        b->reconstruct(temp_dq);

        //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);

        const half alpha = __float2half(1.0f);
        const half beta = clear ? __float2half(0.0f) : __float2half(1.0f);
        cublasHgemm(cublas_handle,
                    CUBLAS_OP_N,
                    CUBLAS_OP_N,
                    size_n, size_m, size_k,
                    &alpha, temp_dq, size_n,
                            a,       size_k,
                    &beta,  c,       size_n);

        //const float alpha = 1.0f;
        //const float beta = clear ? 0.0f : 1.0f;
        //cublasSgemmEx(cublas_handle,
        //             CUBLAS_OP_N,
        //             CUBLAS_OP_N,
        //             size_n, size_m, size_k,
        //             &alpha, temp_dq, CUDA_R_16F, size_n,
        //                     a,       CUDA_R_16F, size_k,
        //             &beta,  c,       CUDA_R_16F, size_n);

        //const float alpha = 1.0f;
        //const float beta = clear ? 0.0f : 1.0f;
        //cublasGemmEx(cublas_handle,
        //             CUBLAS_OP_N, CUBLAS_OP_N,
        //             size_n, size_m, size_k,
        //             &alpha, temp_dq, CUDA_R_16F, size_n,
        //                     a,       CUDA_R_16F, size_k,
        //             &beta,  c,       CUDA_R_16F, size_n,
        //             CUDA_R_16F, CUBLAS_GEMM_DFALT_TENSOR_OP);
    }
    else
    {
        // Quantized matmul

        int block_m_size_max = b->is_gptq ? GPTQ_BLOCK_M_SIZE_MAX : EXL2_BLOCK_M_SIZE_MAX;
        int max_chunks = size_m / block_m_size_max;
        int last_chunk = max_chunks * block_m_size_max;
        int last_chunk_size = size_m - last_chunk;

        if (max_chunks)
        {
            gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, block_m_size_max, clear, r_weights, r_weights_stride, mul_r_weights);
        }

        if (last_chunk_size)
        {
            gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear, r_weights, r_weights_stride, mul_r_weights);
        }
    }
}

__global__ void clear_kernel
(
    half* __restrict__ c,
    const int size_m,
    const int size_n
)
{
    int m = blockIdx.y;
    int n = (blockIdx.x * CLEAR_N_SIZE + threadIdx.x) * 8;
    if (n >= size_n) return;
    int4* c_ptr = (int4*)(c + m * size_n + n);
    *c_ptr = {};
}

void clear_tensor_cuda
(
    half* c,
    int size_m,
    int size_n
)
{
//     dim3 blockDim, gridDim;
//     blockDim.x = CLEAR_N_SIZE;
//     blockDim.y = 1;
//     gridDim.x = DIVIDE(size_n / 8, CLEAR_N_SIZE);
//     gridDim.y = size_m;
//     clear_kernel<<<gridDim, blockDim>>>(c, size_m, size_n);
}


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh
================================================
#ifndef _q_gemm_cuh
#define _q_gemm_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/cuda/CUDAContext.h>

#include "q_matrix.cuh"

void gemm_half_q_half_cuda
(
    cublasHandle_t cublas_handle,
    const half* a,
    QMatrix* b,
    half* c,
    int size_m,
    int size_n,
    int size_k,
    bool clear = false,
    half* reconstruct = NULL,
    bool force_cuda = false,
    const half* r_weights = NULL,
    const int r_weights_stride = 0,
    bool mul_r_weights = false
);

void clear_tensor_cuda
(
    half* c,
    int size_m,
    int size_n
);

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh
================================================
#include "compat.cuh"

__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    return __hfma2(result, __halves2half2(qs_h, qs_h), g_result);
}

__forceinline__ __device__ half2 dot22_16(half2(&dq)[8], const half* a_ptr, const half2 g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    return __hfma2(result, __halves2half2(qs_h, qs_h), g_result);
}

__forceinline__ __device__ half2 dot22_32(half2(&dq)[16], const half* a_ptr, const half2 g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result);
    return __hfma2(result, __halves2half2(qs_h, qs_h), g_result);
}

__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr, const float g_result, const float qs_f)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result));
    return fma(result_f, qs_f, g_result);
}

__forceinline__ __device__ float dot22_16_f(half2(&dq)[8], const half* a_ptr, const float g_result, const float qs_f)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result));
    return fma(result_f, qs_f, g_result);
}

__forceinline__ __device__ float dot22_32_f(half2(&dq)[16], const half* a_ptr, const float g_result, const float qs_f)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result);
    float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result));
    return fma(result_f, qs_f, g_result);
}

__forceinline__ __device__ half dot22_8_h(half2(&dq)[4], const half* a_ptr, const half g_result, const half qs_h)
{
    // Use FP32 accumulator to avoid potential overflow since unscaled weights are in the range -128..127

    float result = {};
    #pragma unroll
    for (int i = 0; i < 4; i++)
    {
        half2 w01 = dq[i];
        float w0 = __low2float(w01);
        float w1 = __high2float(w01);
        float x0 = __half2float(*a_ptr++);
        float x1 = __half2float(*a_ptr++);
        result = fma(w0, x0, result);
        result = fma(w1, x1, result);
    }
    float qs = __half2float(qs_h);
    result *= qs;
    half result_h = __float2half_rn(result);
    return __hadd(result_h, g_result);
}

__forceinline__ __device__ half dot22_16_h(half2(&dq)[8], const half* a_ptr, const half g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    half result_h = __hadd(__low2half(result), __high2half(result));
    return __hfma(result_h, qs_h, g_result);
}

__forceinline__ __device__ half dot22_32_h(half2(&dq)[16], const half* a_ptr, const half g_result, const half qs_h)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result);
    half result_h = __hadd(__low2half(result), __high2half(result));
    return __hfma(result_h, qs_h, g_result);
}


typedef void (*fp_gemm_half_q_half_kernel)
(
    const half*,
    const uint32_t*,
    const uint32_t*,
    const half*,
    half*,
    const int,
    const int,
    const int,
    const int,
    const uint16_t*,
    const uint16_t*,
    const int,
    const int,
    const int,
    const int,
    const int,
    const int,
    const bool,
    const half*,
    const int
);

template <int m_count, bool use_r_weights, bool mul_r_weights>
__global__ void gemm_half_q_half_kernel
(
    const half*      __restrict__ a,
    const uint32_t*  __restrict__ b_q_weight,
    const uint32_t*  __restrict__ b_q_scale,
    const half*      __restrict__ b_q_scale_max,
    half*            __restrict__ c,
    const int size_m,
    const int size_n,
    const int size_k,
    const int groups,
    const uint16_t* __restrict__ b_q_group_map,
    const uint16_t* __restrict__ b_q_perm,
    const int rows_8,
    const int rows_6,
    const int rows_5,
    const int rows_4,
    const int rows_3,
    const int rows_2,
    const bool clear,
    const half* r_weights,
    const int r_weights_stride
)
{
    MatrixView_half a_(a, size_m, size_k);
    MatrixView_half_rw c_(c, size_m, size_n);
    MatrixView_q4_row b_q_scale_(b_q_scale, groups, size_n);

    int t = threadIdx.x;

    // Block

    int offset_n = blockIdx.x * EXL2_BLOCK_KN_SIZE * 4;
    int offset_m = blockIdx.y * m_count;
    int offset_k = blockIdx.z * EXL2_BLOCK_KN_SIZE;

    int end_n = min(offset_n + EXL2_BLOCK_KN_SIZE * 4, size_n);
    int end_m = min(offset_m + m_count, size_m);
    int end_k = min(offset_k + EXL2_BLOCK_KN_SIZE, size_k);
    int n = offset_n + t * 4;

    // Read weights

    half_uint16 weights[MAX_Q_GEMM_WEIGHTS];
    if constexpr (use_r_weights)
    {
        uint16_t any_w = 0;
        const half* w_ptr = r_weights;
        for (int m = 0; m < m_count; ++m)
        {
            weights[m].as_half = *w_ptr;
            w_ptr += r_weights_stride;
            any_w |= weights[m].as_uint16;
        }
        if (!any_w) return;  // Early exit if all weights are zero -- does not zero output (!!!)
    }

    // Preload block_a

    __shared__ half block_a[m_count][EXL2_BLOCK_KN_SIZE];

    if (offset_k + t < end_k)
    {
        for (int m = 0; m < m_count; ++m)
        {
            const half* a_ptr = a_.item_ptr(offset_m + m, 0);
            half* block_a_ptr = block_a[m];
            half a0 = a_ptr[b_q_perm[offset_k + t]];
//            half a0 = a_ptr[offset_k + t];
            block_a_ptr[t] = a0;
        }
    }

    // Clear

    if (n >= size_n) return;

    if (clear && blockIdx.z == 0) // && (threadIdx.x & 1) == 0)
    {
        for (int m = 0; m < m_count; m++)
            *((uint64_t*) c_.item_ptr(offset_m + m, n)) = 0;
    }

    __syncthreads();

    // Find initial group

    //int group = offset_k / groupsize;
    int group = b_q_group_map[offset_k * 2];

//    if (offset_m == 0 && t == 0)
//        DBGI2(offset_k, group);

    // Preload scales

    half scales[EXL2_MAX_GROUPS_IN_BLOCK][4];

    //int groups_in_block = DIVIDE((end_k - offset_k), groupsize);
    int temp_k = offset_k;
    for (int g = 0; temp_k < end_k; g++)
    {
        int qscales[4];
        b_q_scale_.item4(qscales, group + g, n);
        qscales[0]++;
        qscales[1]++;
        qscales[2]++;
        qscales[3]++;
        half maxscale = b_q_scale_max[group + g];
        scales[g][0] = __hmul(__int2half_rn(qscales[0] * qscales[0]), maxscale);
        scales[g][1] = __hmul(__int2half_rn(qscales[1] * qscales[1]), maxscale);
        scales[g][2] = __hmul(__int2half_rn(qscales[2] * qscales[2]), maxscale);
        scales[g][3] = __hmul(__int2half_rn(qscales[3] * qscales[3]), maxscale);
        temp_k += b_q_group_map[temp_k * 2 + 1];
    }

    // a, b offset

    int pre_rows_8 = min(rows_8, offset_k);
    int pre_rows_6 = offset_k > rows_8 ? min(rows_6, offset_k) - rows_8 : 0;
    int pre_rows_5 = offset_k > rows_6 ? min(rows_5, offset_k) - rows_6 : 0;
    int pre_rows_4 = offset_k > rows_5 ? min(rows_4, offset_k) - rows_5 : 0;
    int pre_rows_3 = offset_k > rows_4 ? min(rows_3, offset_k) - rows_4 : 0;
    int pre_rows_2 = offset_k > rows_3 ? min(rows_2, offset_k) - rows_3 : 0;
    int qk = 0;
    qk += pre_rows_8 / 32 * 8;
    qk += pre_rows_6 / 32 * 6;
    qk += pre_rows_5 / 32 * 5;
    qk += pre_rows_4 / 32 * 4;
    qk += pre_rows_3 / 32 * 3;
    qk += pre_rows_2 / 32 * 2;

    const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
    const half* a_ptr = &block_a[0][0];
    int a_stride = EXL2_BLOCK_KN_SIZE;

    // Initial group

    int scales_idx = 0;
    half qs_h0 = scales[scales_idx][0];
    half qs_h1 = scales[scales_idx][1];
    half qs_h2 = scales[scales_idx][2];
    half qs_h3 = scales[scales_idx][3];
    int nextgroup = offset_k + b_q_group_map[offset_k * 2 + 1];

    // Column result

    half block_c[m_count][4] = {};

    // Dequantize groups

    int k = offset_k;

    while (k < rows_8 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 4; j++)
        {
            int4 load_int4[2];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][4];
            dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n);
            dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n);
            dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n);
            dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
                block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 8;
        }
        k += 32;
    }

    while (k < rows_6 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 2; j++)
        {
            int4 load_int4[3];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[2] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][8];
            dequant_6bit_16(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n);
            dequant_6bit_16(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n);
            dequant_6bit_16(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n);
            dequant_6bit_16(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
                block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 16;
        }
        k += 32;
    }

    while (k < rows_5 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 1; j++)
        {
            int4 load_int4[5];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[2] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[3] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[4] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][16];
            dequant_5bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, load_int4[3].x, load_int4[4].x, dq[0], size_n);
            dequant_5bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, load_int4[3].y, load_int4[4].y, dq[1], size_n);
            dequant_5bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, load_int4[3].z, load_int4[4].z, dq[2], size_n);
            dequant_5bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, load_int4[3].w, load_int4[4].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
                block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 32;
        }

        k += 32;
    }

    while (k < rows_4 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 4; j++)
        {
            int4 load_int4[1];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][4];
            dequant_4bit_8(load_int4[0].x, dq[0], size_n);
            dequant_4bit_8(load_int4[0].y, dq[1], size_n);
            dequant_4bit_8(load_int4[0].z, dq[2], size_n);
            dequant_4bit_8(load_int4[0].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
                block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 8;
        }
        k += 32;
    }

    while (k < rows_3 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 1; j++)
        {
            int4 load_int4[3];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
            load_int4[2] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][16];
            dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n);
            dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n);
            dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n);
            dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
                block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }
            a_ptr += 32;
        }
        k += 32;
    }

    while (k < rows_2 && k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            scales_idx++;
            qs_h0 = scales[scales_idx][0];
            qs_h1 = scales[scales_idx][1];
            qs_h2 = scales[scales_idx][2];
            qs_h3 = scales[scales_idx][3];
            nextgroup += b_q_group_map[k * 2 + 1];
        }

        #pragma unroll
        for (int j = 0; j < 1; j++)
        {
            int4 load_int4[1];
            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;

            half2 dq[4][8];
            dequant_2bit_16(load_int4[0].x, dq[0], size_n);
            dequant_2bit_16(load_int4[0].y, dq[1], size_n);
            dequant_2bit_16(load_int4[0].z, dq[2], size_n);
            dequant_2bit_16(load_int4[0].w, dq[3], size_n);

            for (int m = 0; m < m_count; m++)
            {
                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
                block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
                block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
                block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
                block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
            }

            a_ptr += 16;
        }
        k += 16;
    }

    // Accumulate column sums in c

    for (int m = 0; m < m_count; m++)
    {
        half2* out = (half2*)c_.item_ptr(offset_m + m, n);
        half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]);
        half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]);

        if constexpr (mul_r_weights)
        {
            half2 w_mul2 = __half2half2(weights[m].as_half);
            result01 = __hmul2(result01, w_mul2);
            result23 = __hmul2(result23, w_mul2);
        }

        atomicAdd(out    , result01);
        atomicAdd(out + 1, result23);
//        *out = result01;
//        *(out + 1) = result23;
    }
}

template <bool use_r_weights, bool mul_r_weights>
struct map_m_count_exl2 {
    static constexpr fp_gemm_half_q_half_kernel pick_gemm_half_q_half_kernel(const int m_count)
    {
        #if EXL2_BLOCK_M_SIZE_MAX >= 1
        if (m_count == 1) return gemm_half_q_half_kernel<1, use_r_weights, mul_r_weights>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 2
        if (m_count == 2) return gemm_half_q_half_kernel<2, use_r_weights, mul_r_weights>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 3
        if (m_count == 3) return gemm_half_q_half_kernel<3, use_r_weights, mul_r_weights>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 4
        if (m_count == 4) return gemm_half_q_half_kernel<4, use_r_weights, mul_r_weights>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 5
        if (m_count == 5) return gemm_half_q_half_kernel<5, use_r_weights, mul_r_weights>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 6
        if (m_count == 6) return gemm_half_q_half_kernel<6, use_r_weights, mul_r_weights>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 7
        if (m_count == 7) return gemm_half_q_half_kernel<7, use_r_weights, mul_r_weights>;
        #endif
        #if EXL2_BLOCK_M_SIZE_MAX >= 8
        if (m_count == 8) return gemm_half_q_half_kernel<8, use_r_weights, mul_r_weights>;
        #endif
        return NULL;
    }
};

fp_gemm_half_q_half_kernel pick_gemm_half_q_half_kernel(const int m_count, bool r_weights, bool mul_r_weights)
{
    if (!r_weights && !mul_r_weights) return map_m_count_exl2<false, false>::pick_gemm_half_q_half_kernel(m_count);
    if (!r_weights &&  mul_r_weights) return map_m_count_exl2<false,  true>::pick_gemm_half_q_half_kernel(m_count);
    if ( r_weights && !mul_r_weights) return map_m_count_exl2< true, false>::pick_gemm_half_q_half_kernel(m_count);
    if ( r_weights &&  mul_r_weights) return map_m_count_exl2< true,  true>::pick_gemm_half_q_half_kernel(m_count);
    return NULL;
}


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh
================================================
#include "compat.cuh"

__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    return __hadd2(result, g_result);
}

__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    return __half2float(__low2half(result)) + __half2float(__high2half(result));
}

__forceinline__ __device__ half2 dot22_8_h2(half2(&dq)[4], const half* a_ptr)
{
    half2 result = {};
    const half2* a2_ptr = (const half2*)a_ptr;
    #pragma unroll
    for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result);
    return result;
}

typedef void (*fp_gemm_half_q_half_gptq_kernel)
(
    const half*,
    const uint32_t*,
    const uint32_t*,
    const half*,
    half*,
    const int,
    const int,
    const int,
    const int,
    const int,
    const uint16_t*,
    const int,
    const bool,
    const half*,
    const int
);

template <int m_count, bool use_r_weights, bool mul_r_weights>
__global__ void gemm_half_q_half_gptq_kernel
(
    const half* __restrict__ a,
    const uint32_t* __restrict__ b_q_weight,
    const uint32_t* __restrict__ b_gptq_qzeros,
    const half* __restrict__ b_gptq_scales,
    half* __restrict__ c,
    const int size_m,
    const int size_n,
    const int size_k,
    const int groups,
    const int groupsize,
    const uint16_t* __restrict__ b_q_perm,
    const int rows_4,
    const bool clear,
    const half* r_weights,
    const int r_weights_stride
)
{
    MatrixView_half a_(a, size_m, size_k);
    MatrixView_half_rw c_(c, size_m, size_n);
    MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
    MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

    int t = threadIdx.x;

    // Block

    int offset_n = blockIdx.x * GPTQ_BLOCK_KN_SIZE * 4;
    int offset_m = blockIdx.y * m_count;
    int offset_k = blockIdx.z * GPTQ_BLOCK_KN_SIZE;

    int end_n = min(offset_n + GPTQ_BLOCK_KN_SIZE * 4, size_n);
    int end_m = min(offset_m + m_count, size_m);
    int end_k = min(offset_k + GPTQ_BLOCK_KN_SIZE, size_k);

    int n = offset_n + t * 4;

    // Read weights

    half_uint16 weights[MAX_Q_GEMM_WEIGHTS];
    if constexpr (use_r_weights)
    {
        uint16_t any_w = 0;
        const half* w_ptr = r_weights;
        for (int m = 0; m < m_count; ++m)
        {
            weights[m].as_half = *w_ptr;
            w_ptr += r_weights_stride;
            any_w |= weights[m].as_uint16;
        }
        if (!any_w) return;  // Early exit if all weights are zero -- does not zero output (!!!)
    }

    // Preload block_a

    __shared__ half block_a[m_count][GPTQ_BLOCK_KN_SIZE];

    if (offset_k + t < end_k)
    {
        for (int m = 0; m < m_count; ++m)
        {
            const half* a_ptr = a_.item_ptr(offset_m + m, 0);
            half* block_a_ptr = block_a[m];

            half a0;
            if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]];
            else a0 = a_ptr[offset_k + t];
            block_a_ptr[t] = a0;
        }
    }

    // Zero output

    if (n >= size_n) return;

    if (clear && blockIdx.z == 0) // && (threadIdx.x & 1) == 0)
    {
        for (int m = 0; m < m_count; m++)
            *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
    }

    __syncthreads();

    // Find initial group

    int group = offset_k / groupsize;
    int nextgroup = offset_k + groupsize;

    // a, b offset

    int qk = offset_k / (32 / 4);

    const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
    const half* a_ptr = &block_a[0][0];
    int a_stride = GPTQ_BLOCK_KN_SIZE;

    // Initial group

    int zeros[4];
    half2 scales[4];
    half2 z1z16[4][2];
    half2 y1y16[4][2];
    b_gptq_qzeros_.item4(zeros, group, n);
    b_gptq_scales_.item4_h2(scales, group, n);
    dequant_4bit_8_prep_zero((zeros[0] + 1) & 0x0F, z1z16[0], y1y16[0]);
    dequant_4bit_8_prep_zero((zeros[1] + 1) & 0x0F, z1z16[1], y1y16[1]);
    dequant_4bit_8_prep_zero((zeros[2] + 1) & 0x0F, z1z16[2], y1y16[2]);
    dequant_4bit_8_prep_zero((zeros[3] + 1) & 0x0F, z1z16[3], y1y16[3]);

//    __syncthreads();

    // Column result

    half2 block_c[m_count][4] = {};

    // Dequantize and multiply

    int k = offset_k;
    while (k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            nextgroup += groupsize;
            b_gptq_qzeros_.item4(zeros, group, n);
            b_gptq_scales_.item4_h2(scales, group, n);
            dequant_4bit_8_prep_zero((zeros[0] + 1) & 0x0F, z1z16[0], y1y16[0]);
            dequant_4bit_8_prep_zero((zeros[1] + 1) & 0x0F, z1z16[1], y1y16[1]);
            dequant_4bit_8_prep_zero((zeros[2] + 1) & 0x0F, z1z16[2], y1y16[2]);
            dequant_4bit_8_prep_zero((zeros[3] + 1) & 0x0F, z1z16[3], y1y16[3]);
        }

        #pragma unroll
        for (int j = 0; j < 4; j++)
        {
            const int4* b_ptr4 = (int4*) b_ptr;
            int4 load_int4 = *b_ptr4;

            half2 dq[4][4];
            dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false);
            dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false);
            dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false);
            dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false);

            #pragma unroll
            for (int m = 0; m < m_count; m++)
            {
                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
                block_c[m][0] = __hfma2(dot22_8_h2(dq[0], a_ptr + m * a_stride), scales[0], block_c[m][0]);
                block_c[m][1] = __hfma2(dot22_8_h2(dq[1], a_ptr + m * a_stride), scales[1], block_c[m][1]);
                block_c[m][2] = __hfma2(dot22_8_h2(dq[2], a_ptr + m * a_stride), scales[2], block_c[m][2]);
                block_c[m][3] = __hfma2(dot22_8_h2(dq[3], a_ptr + m * a_stride), scales[3], block_c[m][3]);
            }

            b_ptr += size_n;
            a_ptr += 8;
        }

        k += 32;
    }

    for (int m = 0; m < m_count; m++)
    {
        half2 *out = (half2*) c_.item_ptr(offset_m + m, n);
        half result0 = __hadd(__low2half(block_c[m][0]), __high2half(block_c[m][0]));
        half result1 = __hadd(__low2half(block_c[m][1]), __high2half(block_c[m][1]));
        half result2 = __hadd(__low2half(block_c[m][2]), __high2half(block_c[m][2]));
        half result3 = __hadd(__low2half(block_c[m][3]), __high2half(block_c[m][3]));
        half2 result01 = __halves2half2(result0, result1);
        half2 result23 = __halves2half2(result2, result3);

        if constexpr (mul_r_weights)
        {
            half2 w_mul2 = __half2half2(weights[m].as_half);
            result01 = __hmul2(result01, w_mul2);
            result23 = __hmul2(result23, w_mul2);
        }

        atomicAdd(out    , result01);
        atomicAdd(out + 1, result23);
    }
}

template <bool use_r_weights, bool mul_r_weights>
struct map_m_count_gptq {
    static constexpr fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(int m_count)
    {
        #if GPTQ_BLOCK_M_SIZE_MAX >= 1
        if (m_count == 1) return gemm_half_q_half_gptq_kernel<1, use_r_weights, mul_r_weights>;
        #endif
        #if GPTQ_BLOCK_M_SIZE_MAX >= 2
        if (m_count == 2) return gemm_half_q_half_gptq_kernel<2, use_r_weights, mul_r_weights>;
        #endif
        #if GPTQ_BLOCK_M_SIZE_MAX >= 3
        if (m_count == 3) return gemm_half_q_half_gptq_kernel<3, use_r_weights, mul_r_weights>;
        #endif
        #if GPTQ_BLOCK_M_SIZE_MAX >= 4
        if (m_count == 4) return gemm_half_q_half_gptq_kernel<4, use_r_weights, mul_r_weights>;
        #endif
        #if GPTQ_BLOCK_M_SIZE_MAX >= 5
        if (m_count == 5) return gemm_half_q_half_gptq_kernel<5, use_r_weights, mul_r_weights>;
        #endif
        #if GPTQ_BLOCK_M_SIZE_MAX >= 6
        if (m_count == 6) return gemm_half_q_half_gptq_kernel<6, use_r_weights, mul_r_weights>;
        #endif
        #if GPTQ_BLOCK_M_SIZE_MAX >= 7
        if (m_count == 7) return gemm_half_q_half_gptq_kernel<7, use_r_weights, mul_r_weights>;
        #endif
        #if GPTQ_BLOCK_M_SIZE_MAX >= 8
        if (m_count == 8) return gemm_half_q_half_gptq_kernel<8, use_r_weights, mul_r_weights>;
        #endif
        return NULL;
    }
};

fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(const int m_count, bool r_weights, bool mul_r_weights)
{
    if (!r_weights && !mul_r_weights) return map_m_count_gptq<false, false>::pick_gemm_half_q_half_gptq_kernel(m_count);
    if (!r_weights &&  mul_r_weights) return map_m_count_gptq<false,  true>::pick_gemm_half_q_half_gptq_kernel(m_count);
    if ( r_weights && !mul_r_weights) return map_m_count_gptq< true, false>::pick_gemm_half_q_half_gptq_kernel(m_count);
    if ( r_weights &&  mul_r_weights) return map_m_count_gptq< true,  true>::pick_gemm_half_q_half_gptq_kernel(m_count);
    return NULL;
}


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu
================================================
#include "q_matrix.cuh"
#include "matrix_view.cuh"
#include "util.cuh"

#include "quant/qdq_2.cuh"
#include "quant/qdq_3.cuh"
#include "quant/qdq_4.cuh"
#include "quant/qdq_5.cuh"
#include "quant/qdq_6.cuh"
#include "quant/qdq_8.cuh"

#define BLOCK_KN_SIZE 128

#define THREADS_X 32
#define THREADS_Y 32

// Shuffle quantized data on load

__global__ void shuffle_kernel
(
    uint32_t* __restrict__ b_q_weight,
    const int size_k,
    const int size_n,
    const int rows_8,
    const int rows_6,
    const int rows_5,
    const int rows_4,
    const int rows_3,
    const int rows_2
)
{
    int n = blockIdx.x * THREADS_X + threadIdx.x;
    if (n >= size_n) return;
    int k = 0;
    uint32_t* b_ptr = b_q_weight + n;
    while (k < rows_8) { shuffle_8bit_4 (b_ptr, size_n); b_ptr += 1 * size_n; k +=  4; }
    while (k < rows_6) { shuffle_6bit_16(b_ptr, size_n); b_ptr += 3 * size_n; k += 16; }
    while (k < rows_5) { shuffle_5bit_32(b_ptr, size_n); b_ptr += 5 * size_n; k += 32; }
    while (k < rows_4) { shuffle_4bit_8 (b_ptr, size_n); b_ptr += 1 * size_n; k +=  8; }
    while (k < rows_3) { shuffle_3bit_32(b_ptr, size_n); b_ptr += 3 * size_n; k += 32; }
    while (k < rows_2) { shuffle_2bit_16(b_ptr, size_n); b_ptr += 1 * size_n; k += 16; }
}


// QMatrix constructor

QMatrix::QMatrix
(
    const int _device,
    const int _height,
    const int _width,
    const int _groups,

    uint32_t* _q_weight,
    uint16_t* _q_perm,
    uint16_t* _q_invperm,
    uint32_t* _q_scale,
    half* _q_scale_max,
    uint16_t* _q_groups,
    uint16_t* _q_group_map,

    uint32_t* _gptq_qzeros,
    half* _gptq_scales,
    uint32_t* _gptq_g_idx,

    half* _temp_dq
) :
    device(_device),
    height(_height),
    width(_width),
    groups(_groups),
    temp_dq(_temp_dq)
{
    cudaSetDevice(device);

    failed = false;

    cuda_q_weight = _q_weight;
    cuda_q_perm = _q_perm;
    cuda_q_invperm = _q_invperm;
    cuda_q_scale = _q_scale;
    cuda_q_scale_max = _q_scale_max;
    cuda_q_groups = _q_groups;
    cuda_q_group_map = _q_group_map;
    cuda_gptq_qzeros = _gptq_qzeros;
    cuda_gptq_scales = _gptq_scales;

    is_gptq = (_gptq_qzeros != NULL);

    if (is_gptq)
    {
        gptq_groupsize = 1;
        while (gptq_groupsize * groups < height) gptq_groupsize *= 2;
    }

    // Create group map

    rows_8 = 0;
    rows_6 = 0;
    rows_5 = 0;
    rows_4 = 0;
    rows_3 = 0;
    rows_2 = 0;

    if (!is_gptq)
    {
        uint16_t* cpu_q_groups = (uint16_t*)calloc(groups * 2, sizeof(uint16_t));
        cudaMemcpy(cpu_q_groups, cuda_q_groups, groups * 2 * sizeof(uint16_t), cudaMemcpyDeviceToHost);

        int row = 0;
        for (int i = 0; i < groups; i++)
        {
            int bits = cpu_q_groups[i * 2];

            int rows;
            if (i < groups - 1)
            {
                int qrows = cpu_q_groups[i * 2 + 3] - cpu_q_groups[i * 2 + 1];
                rows = qrows * 32 / bits;
            }
            else rows = height - row;

            if (bits == 8) rows_8 += rows;
            if (bits == 6) rows_6 += rows;
            if (bits == 5) rows_5 += rows;
            if (bits == 4) rows_4 += rows;
            if (bits == 3) rows_3 += rows;
            if (bits == 2) rows_2 += rows;
            row += rows;
        }

        free(cpu_q_groups);

        rows_6 += rows_8;
        rows_5 += rows_6;
        rows_4 += rows_5;
        rows_3 += rows_4;
        rows_2 += rows_3;
    }
    else
    {
        rows_4 = height;
        rows_3 = height;
        rows_2 = height;

        if (_gptq_g_idx)
        {
            if (!make_sequential(_gptq_g_idx))
            {
                failed = true;
                //printf("FAIL\n");
                return;
            }
        }
    }

//     DBGI(rows_8);
//     DBGI(rows_6);
//     DBGI(rows_5);
//     DBGI(rows_4);
//     DBGI(rows_3);
//     DBGI(rows_2);

    // Shuffle quantized data

    dim3 blockDim, gridDim;
    blockDim.x = THREADS_X;
    blockDim.y = 1;
    gridDim.x = DIVIDE(width, THREADS_X);
    gridDim.y = 1;
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    shuffle_kernel<<<gridDim, blockDim, 0, stream>>>(cuda_q_weight, height, width, rows_8, rows_6, rows_5, rows_4, rows_3, rows_2);
}

QMatrix::~QMatrix()
{
}

// Reconstruct b[k,n] (GPTQ)

__global__ void reconstruct_gptq_kernel
(
    const uint32_t* __restrict__ b_q_weight,
    const uint16_t* __restrict__ b_q_perm,
    const uint32_t* __restrict__ b_gptq_qzeros,
    const half* __restrict__ b_gptq_scales,
    //const uint16_t* __restrict__ b_q_groups,
    const int size_k,
    const int size_n,
    const int groupsize,
    const int groups,
    half* __restrict__ b,
    const int rows_4
)
{
    MatrixView_half_rw b_(b, size_k, size_n);
    MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
    MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

    int offset_k = BLOCK_KN_SIZE * blockIdx.y;
    int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;

    int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);

    // Preload remapping table

    __shared__ uint16_t perm[BLOCK_KN_SIZE];
    int t = threadIdx.x;

    if (b_q_perm)
    {
        if (offset_k + t < size_k)
            perm[t] = b_q_perm[offset_k + t];
    }

    // Column

    int n = offset_n + t * 4;
    if (n >= size_n) return;

    // Find initial group

    int group = offset_k / groupsize;
    int nextgroup = offset_k + groupsize;

    // b offset

    int qk = offset_k / (32 / 4);

    const uint32_t* b_ptr = b_q_weight + qk * size_n + n;

    // Initial zeros/scale

    int zeros[4];
    half2 scales[4];
    half2 z1z16[4][2];
    half2 y1y16[4][2];
    b_gptq_qzeros_.item4(zeros, group, n);
    b_gptq_scales_.item4_h2(scales, group, n);
    dequant_4bit_8_prep_zero((zeros[0] + 1) & 0x0F, z1z16[0], y1y16[0]);
    dequant_4bit_8_prep_zero((zeros[1] + 1) & 0x0F, z1z16[1], y1y16[1]);
    dequant_4bit_8_prep_zero((zeros[2] + 1) & 0x0F, z1z16[2], y1y16[2]);
    dequant_4bit_8_prep_zero((zeros[3] + 1) & 0x0F, z1z16[3], y1y16[3]);

    __syncthreads();

    int k = offset_k;
    int lk = 0;

    while (k < end_k)
    {
        if (k == nextgroup)
        {
            group++;
            nextgroup += groupsize;
            b_gptq_qzeros_.item4(zeros, group, n);
            b_gptq_scales_.item4_h2(scales, group, n);
            dequant_4bit_8_prep_zero((zeros[0] + 1) & 0x0F, z1z16[0], y1y16[0]);
            dequant_4bit_8_prep_zero((zeros[1] + 1) & 0x0F, z1z16[1], y1y16[1]);
            dequant_4bit_8_prep_zero((zeros[2] + 1) & 0x0F, z1z16[2], y1y16[2]);
            dequant_4bit_8_prep_zero((zeros[3] + 1) & 0x0F, z1z16[3], y1y16[3]);
        }

        for (int p = 0; p < 4; p++)
        {
            half2 dq[4][4];
            const int4* b_ptr4 = (int4*) b_ptr;
            int4 load_int4 = *b_ptr4;

            dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false);
            dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false);
            dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false);
            dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false);

            b_ptr += size_n;
            //half* dqh = (half*)dq;
            if (b_q_perm)
            {
                for (int j = 0; j < 4; j++)
                {
                    for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]);
                    b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j]));
                    b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j]));
                }
            }
            else
            {
                for (int j = 0; j < 4; j++)
                {
                    for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]);
                    b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j]));
                    b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j]));
                }
            }
        }
        k += 32;
    }
}


// Reconstruct b[k,n]

__global__ void reconstruct_kernel
(
    const uint32_t* __restrict__ b_q_weight,
    const uint16_t* __restrict__ b_q_perm,
    const uint32_t* __restrict__ b_q_scale,
    const half* __restrict__ b_q_scale_max,
    const uint16_t* __restrict__ b_q_group_map,
    const int size_k,
    const int size_n,
    //const int groupsize,
    const int groups,
    half* __restrict__ b,
    const int rows_8,
    const int rows_6,
    const int rows_5,
    const int rows_4,
    const int rows_3,
    const int rows_2
)
{
    MatrixView_half_rw b_(b, size_k, size_n);
    MatrixView_q4_row b_q_scale_(b_q_scale, groups, size_n);

    int offset_k = BLOCK_KN_SIZE * blockIdx.y;
    int offset_n = BLOCK_KN_SIZE * blockIdx.x;

    // Preload remapping table

    int t = threadIdx.x;
    __shared__ uint16_t perm[BLOCK_KN_SIZE];
    if (offset_k + t < size_k)
        perm[t] = b_q_perm[offset_k + t];

    // Column

    int n = offset_n + t;
    if (n >= size_n) return;

    // Find initial group

    // int group = offset_k / groupsize;
    int group = b_q_group_map[offset_k * 2];

    int pre_rows_8 = min(rows_8, offset_k);
    int pre_rows_6 = offset_k > rows_8 ? min(rows_6, offset_k) - rows_8 : 0;
    int pre_rows_5 = offset_k > rows_6 ? min(rows_5, offset_k) - rows_6 : 0;
    int pre_rows_4 = offset_k > rows_5 ? min(rows_4, offset_k) - rows_5 : 0;
    int pre_rows_3 = offset_k > rows_4 ? min(rows_3, offset_k) - rows_4 : 0;
    int pre_rows_2 = offset_k > rows_3 ? min(rows_2, offset_k) - rows_3 : 0;
    int qk = 0;
    qk += pre_rows_8 / 32 * 8;
    qk += pre_rows_6 / 32 * 6;
    qk += pre_rows_5 / 32 * 5;
    qk += pre_rows_4 / 32 * 4;
    qk += pre_rows_3 / 32 * 3;
    qk += pre_rows_2 / 32 * 2;

    const uint32_t* b_ptr = b_q_weight + qk * size_n + n;

    half qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]);
    half2 qs_h2 = __halves2half2(qs_h, qs_h);
    int nextgroup = offset_k + b_q_group_map[offset_k * 2 + 1];

    int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
    int k = offset_k;
    int lk = 0;

    __syncthreads();

    while (k < rows_8 && k < end_k)
    {
        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
        for (int p = 0; p < 4; p++)
        {
            half2 dq[4];
            uint32_t q_0 = *b_ptr; b_ptr += size_n;
            uint32_t q_1 = *b_ptr; b_ptr += size_n;
            dequant_8bit_8(q_0, q_1, dq, size_n);
            for (int j = 0; j < 4; j++) dq[j] = __hmul2(dq[j], qs_h2);
            half* dqh = (half*) dq;
            for (int j = 0; j < 8; j++) b_.set(perm[lk++], n, dqh[j]);
        }
        k += 32;
    }

    while (k < rows_6 && k < end_k)
    {
        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
        for (int p = 0; p < 2; p++)
        {
            half2 dq[8];
            uint32_t q_0 = *b_ptr; b_ptr += size_n;
            uint32_t q_1 = *b_ptr; b_ptr += size_n;
            uint32_t q_2 = *b_ptr; b_ptr += size_n;
            dequant_6bit_16(q_0, q_1, q_2, dq, size_n);
            for (int j = 0; j < 8; j++) dq[j] = __hmul2(dq[j], qs_h2);
            half* dqh = (half*) dq;
            for (int j = 0; j < 16; j++) b_.set(perm[lk++], n, dqh[j]);
        }
        k += 32;
    }

    while (k < rows_5 && k < end_k)
    {
        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
        for (int p = 0; p < 1; p++)
        {
            half2 dq[16];
            uint32_t q_0 = *b_ptr; b_ptr += size_n;
            uint32_t q_1 = *b_ptr; b_ptr += size_n;
            uint32_t q_2 = *b_ptr; b_ptr += size_n;
            uint32_t q_3 = *b_ptr; b_ptr += size_n;
            uint32_t q_4 = *b_ptr; b_ptr += size_n;
            dequant_5bit_32(q_0, q_1, q_2, q_3, q_4, dq, size_n);
            for (int j = 0; j < 16; j++) dq[j] = __hmul2(dq[j], qs_h2);
            half* dqh = (half*) dq;
            for (int j = 0; j < 32; j++) b_.set(perm[lk++], n, dqh[j]);
        }
        k += 32;
    }

    while (k < rows_4 && k < end_k)
    {
        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
        for (int p = 0; p < 4; p++)
        {
            half2 dq[4];
            uint32_t q_0 = *b_ptr; b_ptr += size_n;
            dequant_4bit_8(q_0, dq, size_n);
            for (int j = 0; j < 4; j++) dq[j] = __hmul2(dq[j], qs_h2);
            half* dqh = (half*) dq;
            for (int j = 0; j < 8; j++) b_.set(perm[lk++], n, dqh[j]);
        }
        k += 32;
    }

    while (k < rows_3 && k < end_k)
    {
        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
        for (int p = 0; p < 1; p++)
        {
            half2 dq[16];
            uint32_t q_0 = *b_ptr; b_ptr += size_n;
            uint32_t q_1 = *b_ptr; b_ptr += size_n;
            uint32_t q_2 = *b_ptr; b_ptr += size_n;
            dequant_3bit_32(q_0, q_1, q_2, dq, size_n);
            for (int j = 0; j < 16; j++) dq[j] = __hmul2(dq[j], qs_h2);
            half* dqh = (half*) dq;
            for (int j = 0; j < 32; j++) b_.set(perm[lk++], n, dqh[j]);
        }
        k += 32;
    }

    while (k < rows_2 && k < end_k)
    {
        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
        for (int p = 0; p < 1; p++)
        {
            half2 dq[8];
            uint32_t q_0 = *b_ptr; b_ptr += size_n;
            dequant_2bit_16(q_0, dq, size_n);
            for (int j = 0; j < 8; j++) dq[j] = __hmul2(dq[j], qs_h2);
            half* dqh = (half*) dq;
            for (int j = 0; j < 16; j++) b_.set(perm[lk++], n, dqh[j]);
        }
        k += 16;
    }
}

void QMatrix::reconstruct(half* out)
{
    dim3 blockDim, gridDim;
    blockDim.x = BLOCK_KN_SIZE;
    blockDim.y = 1;
    gridDim.y = DIVIDE(height, BLOCK_KN_SIZE);
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    if (!is_gptq)
    {
        gridDim.x = DIVIDE(width, BLOCK_KN_SIZE);
        reconstruct_kernel<<<gridDim, blockDim, 0, stream>>>
        (
            cuda_q_weight,
            cuda_q_perm,
            cuda_q_scale,
            cuda_q_scale_max,
            cuda_q_group_map,
            height,
            width,
            //groupsize,
            groups,
            out,
            rows_8,
            rows_6,
            rows_5,
            rows_4,
            rows_3,
            rows_2
        );
    }
    else
    {
        gridDim.x = DIVIDE(width, BLOCK_KN_SIZE * 4);
        reconstruct_gptq_kernel<<<gridDim, blockDim, 0, stream>>>
        (
            cuda_q_weight,
            cuda_q_perm,
            cuda_gptq_qzeros,
            cuda_gptq_scales,
            //const uint16_t* __restrict__ b_q_groups,
            height,
            width,
            gptq_groupsize,
            groups,
            out,
            rows_4
        );
    }
}

__global__ void make_sequential_kernel
(
    const uint32_t* __restrict__ w,
    uint32_t* __restrict__ w_new,
    const uint16_t* __restrict__ q_perm,
    const int w_height,
    const int w_width
)
{
    const uint64_t* w2 = (uint64_t*) w;
    uint64_t* w_new2 = (uint64_t*) w_new;
    int w2_stride = w_width >> 1;

    int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
    if (w2_column >= w2_stride) return;

    int w_new2_row = blockIdx.y;

    int q_perm_idx = w_new2_row << 3;

    uint64_t dst = 0;

    #pragma unroll
    for (int i = 0; i < 8; i++)
    {
        int source_row = q_perm[q_perm_idx++];

        int w2_row = source_row >> 3;
        int w2_subrow = source_row & 0x07;
        int w2_row_shift = w2_subrow << 2;
        int wnew2_row_shift = i << 2;

        uint64_t src = w2[w2_row * w2_stride + w2_column];
        src >>= w2_row_shift;
        src &= 0x0000000f0000000f;
        src <<= wnew2_row_shift;
        dst |= src;
    }

    w_new2[w_new2_row * w2_stride + w2_column] = dst;
}

bool QMatrix::make_sequential(const uint32_t* cpu_g_idx)
{
    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
    uint32_t* cuda_new_qweight = NULL;
    cudaError_t err = cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
    if (err != cudaSuccess) {
        cudaError_t cuda_status = cudaGetLastError(); // Clear error
        return false;
    }

    uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
    uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
    uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));

    // Group histogram

    for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;

    // Group map

    for (int i = 0, acc = 0; i < groups; i++)
    {
        short tmp = cpu_g_idx_map[i];
        cpu_g_idx_map[i] = acc;
        acc += tmp;
    }

    // X map (inverse)

    for (int row = 0; row < height; row++)
    {
        uint32_t target_group = cpu_g_idx[row];
        uint32_t target_row = cpu_g_idx_map[target_group];
        cpu_g_idx_map[target_group]++;
        cpu_x_map_inv[row] = target_row;
    }

    // X map

    for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;

    // Reduce to uint16_t

    uint16_t* cpu_x_map16 = (uint16_t*)cpu_x_map;
    uint16_t* cpu_x_map_inv16 = (uint16_t*)cpu_x_map_inv;
    for (int row = 0; row < height; row++) cpu_x_map16[row] = (uint16_t) cpu_x_map[row];
    for (int row = 0; row < height; row++) cpu_x_map_inv16[row] = (uint16_t) cpu_x_map_inv[row];

    // Move to CUDA

    cudaMemcpyAsync(cuda_q_perm, cpu_x_map16, height * sizeof(uint16_t), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(cuda_q_invperm, cpu_x_map_inv16, height * sizeof(uint16_t), cudaMemcpyHostToDevice);

    // Rearrange rows in w

    dim3 blockDim, gridDim;
    blockDim.x = THREADS_X;
    blockDim.y = 1;
    gridDim.x = DIVIDE(width, THREADS_X);
    gridDim.y = height / 8;

    make_sequential_kernel<<<gridDim, blockDim, 0, stream>>>
    (
        cuda_q_weight,
        cuda_new_qweight,
        cuda_q_perm,
        height / 8,
        width
    );

    // Replace qweights

    cudaMemcpyAsync(cuda_q_weight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);

    // Cleanup

    cudaDeviceSynchronize();

    cudaFree(cuda_new_qweight);
    free(cpu_g_idx_map);
    free(cpu_x_map);
    free(cpu_x_map_inv);

    return true;
}


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh
================================================
#ifndef _q_matrix_cuh
#define _q_matrix_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>

#define MAX_SUPERGROUPS 16

class QMatrix
{
public:

    int device;
    bool is_gptq;

    int height;
    int width;
    int groups;
    int gptq_groupsize;

    int rows_8;
    int rows_6;
    int rows_5;
    int rows_4;
    int rows_3;
    int rows_2;

    uint32_t* cuda_q_weight = NULL;
    uint16_t* cuda_q_perm = NULL;
    uint16_t* cuda_q_invperm = NULL;
    uint32_t* cuda_q_scale = NULL;
    half* cuda_q_scale_max = NULL;
    uint16_t* cuda_q_groups = NULL;
    uint16_t* cuda_q_group_map = NULL;
    uint32_t* cuda_gptq_qzeros = NULL;
    half* cuda_gptq_scales = NULL;

    half* temp_dq;

    bool failed;

    QMatrix
    (
        const int _device,
        const int _height,
        const int _width,
        const int _groups,

        uint32_t* _q_weight,
        uint16_t* _q_perm,
        uint16_t* _q_invperm,
        uint32_t* _q_scale,
        half* _q_scale_max,
        uint16_t* _q_groups,
        uint16_t* _q_group_map,

        uint32_t* _gptq_qzeros,
        half* _gptq_scales,
        uint32_t* _gptq_g_idx,

        half* _temp_dq
    );

    ~QMatrix();

    void reconstruct(half* out);
    bool make_sequential(const uint32_t* cpu_g_idx);

private:

};

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh
================================================
#ifndef _qdq_2_cuh
#define _qdq_2_cuh

#include "qdq_util.cuh"
#include "../../config.h"

#if QMODE_2BIT == 1

// Permutation:
//
// ffddbb99 77553311  eeccaa88 66442200

__forceinline__ __device__ void shuffle_2bit_16
(
    uint32_t* q,
    int stride
)
{
    uint32_t qa = q[0];
    uint32_t qb = 0;

    #pragma unroll
    for (int i = 0; i < 8; i++)
    {
        uint32_t qa0 = qa & 0x03;
        uint32_t qa1 = (qa & 0x0c) >> 2;
        qa >>= 4;
        qb |= (qa1 << (i * 2 + 16));
        qb |= (qa0 << (i * 2));
    }
    q[0] = qb;
}

__forceinline__ __device__ void dequant_2bit_16
(
    const uint32_t q_0,
    half2 (&dq)[8],
    int stride
)
{
    const uint32_t c0 = 0x64006400;
    const half y4_  = __float2half_rn(1.0f /  4.0f);
    const half y16_ = __float2half_rn(1.0f / 16.0f);
    const half y64_ = __float2half_rn(1.0f / 64.0f);
    const half2 y4  = __halves2half2(y4_,  y4_);
    const half2 y16 = __halves2half2(y16_, y16_);
    const half2 y64 = __halves2half2(y64_, y64_);
    const half z1_  = __float2half_rn(-1024.0f         - 2.0f);
    const half z4_  = __float2half_rn(-1024.0f /  4.0f - 2.0f);
    const half z16_ = __float2half_rn(-1024.0f / 16.0f - 2.0f);
    const half z64_ = __float2half_rn(-1024.0f / 64.0f - 2.0f);
    const half2 z1  = __halves2half2(z1_,  z1_);
    const half2 z4  = __halves2half2(z4_,  z4_);
    const half2 z16 = __halves2half2(z16_, z16_);
    const half2 z64 = __halves2half2(z64_, z64_);

    uint32_t qa = q_0;
    half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1])      + 1024
    half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) *  4 + 1024
    half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024
    half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024
    qa >>= 8;
    half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8])      + 1024
    half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) *  4 + 1024
    half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024
    half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024

    dq[0] = __hadd2(q0.as_half2, z1);
    dq[1] = __hfma2(q1.as_half2, y4,  z4);
    dq[2] = __hfma2(q2.as_half2, y16, z16);
    dq[3] = __hfma2(q3.as_half2, y64, z64);
    dq[4] = __hadd2(q4.as_half2, z1);
    dq[5] = __hfma2(q5.as_half2, y4,  z4);
    dq[6] = __hfma2(q6.as_half2, y16, z16);
    dq[7] = __hfma2(q7.as_half2, y64, z64);
}

#else

__forceinline__ __device__ void shuffle_2bit_16
(
    uint32_t* q,
    int stride
)
{
}

__forceinline__ __device__ void dequant_2bit_16
(
    const uint32_t q_0,
    half2 (&dq)[8],
    int stride
)
{
    half dqh[16];
    for (int i = 0; i < 16; i++) dqh[i] = dq_ns(exb(q_0, i * 2, 0x03), 2);

    for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
}

#endif

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh
================================================
#ifndef _qdq_3_cuh
#define _qdq_3_cuh

#include "qdq_util.cuh"
#include "../../config.h"

#if QMODE_3BIT == 1

// Permutation:
//
// v9997775 55333111  u8886664 44222000  (u, v lsb)
// vjjjhhhf ffdddbbb  uiiiggge eecccaaa
// vtttrrrp ppnnnlll  usssqqqo oommmkkk

__forceinline__ __device__ void shuffle_3bit_32
(
    uint32_t* q,
    int stride
)
{
    uint32_t qa = q[0 * stride];
    uint32_t qb = q[1 * stride];
    uint32_t qc = q[2 * stride];

    // qa: aa999888 77766655  54443332 22111000
    // qb: lkkkjjji iihhhggg  fffeeedd dcccbbba
    // qc: vvvuuutt tsssrrrq  qqpppooo nnnmmmll

    uint32_t qd = qc >> 26;
    qc <<= 4;
    qc |= qb >> 28;
    qb <<= 2;
    qb |= qa >> 30;

    // qa: ..999888 77766655  54443332 22111000
    // qb: ..jjjiii hhhgggff  feeedddc ccbbbaaa
    // qc: ..tttsss rrrqqqpp  pooonnnm mmlllkkk
    // qd:                               vvvuuu

    uint32_t za = 0;
    uint32_t zb = 0;
    uint32_t zc = 0;

    for (int i = 0; i < 5; i++) { uint32_t t0 = qa & 0x07; uint32_t t1 = (qa & 0x38) >> 3; qa >>= 6; za |= (t0 << (i * 3)); za |= (t1 << (i * 3 + 16)); }
    for (int i = 0; i < 5; i++) { uint32_t t0 = qb & 0x07; uint32_t t1 = (qb & 0x38) >> 3; qb >>= 6; zb |= (t0 << (i * 3)); zb |= (t1 << (i * 3 + 16)); }
    for (int i = 0; i < 5; i++) { uint32_t t0 = qc & 0x07; uint32_t t1 = (qc & 0x38) >> 3; qc >>= 6; zc |= (t0 << (i * 3)); zc |= (t1 << (i * 3 + 16)); }

    // za:  9997775 55333111   8886664 44222000
    // zb:  jjjhhhf ffdddbbb   iiiggge eecccaaa
    // zc:  tttrrrp ppnnnlll   sssqqqo oommmkkk
    // qd:                               vvvuuu

    za |= ((qd & 0x01) >> 0) << 15;
    zb |= ((qd & 0x02) >> 1) << 15;
    zc |= ((qd & 0x04) >> 2) << 15;
    za |= ((qd & 0x08) >> 3) << 31;
    zb |= ((qd & 0x10) >> 4) << 31;
    zc |= ((qd & 0x20) >> 5) << 31;

    // za: v9997775 55333111  u8886664 44222000  (u, v lsb)
    // zb: vjjjhhhf ffdddbbb  uiiiggge eecccaaa
    // zc: vtttrrrp ppnnnlll  usssqqqo oommmkkk

    q[0 * stride] = za;
    q[1 * stride] = zb;
    q[2 * stride] = zc;
}

__forceinline__ __device__ void dequant_3bit_32
(
    const uint32_t q_0,
    const uint32_t q_1,
    const uint32_t q_2,
    half2 (&dq)[16],
    int stride
)
{
    const uint32_t c0 = 0x64006400;
    const half y8_  = __float2half_rn(1.0f /  8.0f);
    const half y64_ = __float2half_rn(1.0f / 64.0f);
    const half2 y8  = __halves2half2(y8_,  y8_);
    const half2 y64 = __halves2half2(y64_, y64_);
    const half z1_  = __float2half_rn(-1024.0f         - 4.0f);
    const half z8_  = __float2half_rn(-1024.0f /  8.0f - 4.0f);
    const half z64_ = __float2half_rn(-1024.0f / 64.0f - 4.0f);
    const half2 z1  = __halves2half2(z1_,  z1_);
    const half2 z8  = __halves2half2(z8_,  z8_);
    const half2 z64 = __halves2half2(z64_, z64_);

    uint32_t qa = q_0;
    uint32_t qb = q_1;
    uint32_t qc = q_2;

    half2_uint32 q0((qa & 0x00070007) | c0); // half2(q[ 0], q[ 1])      + 1024
    half2_uint32 q1((qa & 0x00380038) | c0); // half2(q[ 2], q[ 3]) *  8 + 1024
    qa >>= 6;
    half2_uint32 q2((qa & 0x00070007) | c0); // half2(q[ 4], q[ 5])      + 1024
    half2_uint32 q3((qa & 0x00380038) | c0); // half2(q[ 6], q[ 7]) *  8 + 1024
    half2_uint32 q4((qa & 0x01c001c0) | c0); // half2(q[ 8], q[ 9]) * 64 + 1024
    qa >>= 9;
    qa &= 0x00010001;
    half2_uint32 q5((qb & 0x00070007) | c0); // half2(q[10], q[11])      + 1024
    half2_uint32 q6((qb & 0x00380038) | c0); // half2(q[12], q[13]) *  8 + 1024
    qb >>= 6;
    half2_uint32 q7((qb & 0x00070007) | c0); // half2(q[14], q[15])      + 1024
    half2_uint32 q8((qb & 0x00380038) | c0); // half2(q[16], q[17]) *  8 + 1024
    half2_uint32 q9((qb & 0x01c001c0) | c0); // half2(q[18], q[19]) * 64 + 1024
    qb >>= 8;
    qb &= 0x00020002;
    half2_uint32 q10((qc & 0x00070007) | c0); // half2(q[20], q[21])      + 1024
    half2_uint32 q11((qc & 0x00380038) | c0); // half2(q[22], q[23]) *  8 + 1024
    qc >>= 6;
    half2_uint32 q12((qc & 0x00070007) | c0); // half2(q[24], q[25])      + 1024
    half2_uint32 q13((qc & 0x00380038) | c0); // half2(q[26], q[27]) *  8 + 1024
    half2_uint32 q14((qc & 0x01c001c0) | c0); // half2(q[28], q[29]) * 64 + 1024
    qc >>= 7;
    qc &= 0x00040004;
    half2_uint32 q15((qa | qb | qc) | c0);

    dq[ 0] = __hadd2( q0.as_half2, z1);
    dq[ 1] = __hfma2( q1.as_half2, y8,  z8);
    dq[ 2] = __hadd2( q2.as_half2, z1);
    dq[ 3] = __hfma2( q3.as_half2, y8,  z8);
    dq[ 4] = __hfma2( q4.as_half2, y64, z64);
    dq[ 5] = __hadd2( q5.as_half2, z1);
    dq[ 6] = __hfma2( q6.as_half2, y8,  z8);
    dq[ 7] = __hadd2( q7.as_half2, z1);
    dq[ 8] = __hfma2( q8.as_half2, y8,  z8);
    dq[ 9] = __hfma2( q9.as_half2, y64, z64);
    dq[10] = __hadd2(q10.as_half2, z1);
    dq[11] = __hfma2(q11.as_half2, y8,  z8);
    dq[12] = __hadd2(q12.as_half2, z1);
    dq[13] = __hfma2(q13.as_half2, y8,  z8);
    dq[14] = __hfma2(q14.as_half2, y64, z64);
    dq[15] = __hadd2(q15.as_half2, z1);
}

#else

__forceinline__ __device__ void shuffle_3bit_32
(
    uint32_t* q,
    int stride
)
{
}

__forceinline__ __device__ void dequant_3bit_32
(
    const uint32_t q_0,
    const uint32_t q_1,
    const uint32_t q_2,
    half2 (&dq)[16],
    int stride
)
{
    half dqh[32];
    for (int i = 0; i < 10; i++) dqh[     i] = dq_ns(exb(     q_0, i * 3    , 0x07), 4);
                                 dqh[10    ] = dq_ns(exb(q_1, q_0,        30, 0x07), 4);
    for (int i = 0; i < 10; i++) dqh[11 + i] = dq_ns(exb(     q_1, i * 3 + 1, 0x07), 4);
                                 dqh[21    ] = dq_ns(exb(q_2, q_1,        31, 0x07), 4);
    for (int i = 0; i < 10; i++) dqh[22 + i] = dq_ns(exb(     q_2, i * 3 + 2, 0x07), 4);

    for (int i = 0; i < 16; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
}

#endif

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh
================================================
#ifndef _qdq_4_cuh
#define _qdq_4_cuh

#include "qdq_util.cuh"
#include "../../config.h"

#if QMODE_4BIT == 1

// Permutation:
//
// 77775555 33331111  66664444 22220000

__forceinline__ __device__ void shuffle_4bit_8
(
    uint32_t* q,
    int stride
)
{
    uint32_t qa = q[0];
    uint32_t qb = 0;

    #pragma unroll
    for (int i = 0; i < 4; i++)
    {
        uint32_t qa0 = qa & 0x0f;
        uint32_t qa1 = (qa & 0xf0) >> 4;
        qa >>= 8;
        qb |= (qa1 << (i * 4 + 16));
        qb |= (qa0 << (i * 4));
    }
    q[0] = qb;
}

__forceinline__ __device__ void dequant_4bit_8
(
    const uint32_t q_0,
    half2 (&dq)[4],
    int stride
)
{
    const uint32_t c0 = 0x64006400;
    const half y16_ = __float2half_rn(1.0f / 16.0f);
    const half2 y16 = __halves2half2(y16_, y16_);
    const half z1_  = __float2half_rn(-1024.0f         - 8.0f);
    const half z16_ = __float2half_rn(-1024.0f / 16.0f - 8.0f);
    const half2 z1  = __halves2half2(z1_,  z1_);
    const half2 z16 = __halves2half2(z16_, z16_);

    uint32_t qa = q_0;
    half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1])      + 1024
    half2_uint32 q1((qa & 0x00f000f0) | c0); // half2(q[ 2], q[ 3]) * 16 + 1024
    qa >>= 8;
    half2_uint32 q2((qa & 0x000f000f) | c0); // half2(q[ 4], q[ 5])      + 1024
    half2_uint32 q3((qa & 0x00f000f0) | c0); // half2(q[ 6], q[ 7]) * 16 + 1024

    dq[0] = __hadd2(q0.as_half2, z1);
    dq[1] = __hfma2(q1.as_half2, y16, z16);
    dq[2] = __hadd2(q2.as_half2, z1);
    dq[3] = __hfma2(q3.as_half2, y16, z16);
}

__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale
(
    const uint32_t zero,
    const half scale,
    half2 (&z1z16)[2],
    half2 (&y1y16)[2]
)
{
    half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero);
    half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));

    half2 scale2 = __half2half2(scale);

    z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half));
    z1z16[1] = __hmul2(scale2, __half2half2(z16));

    const half y1 = __float2half_rn(1.0f);
    const half y16 = __float2half_rn(1.0f / 16.0f);

    y1y16[0] = __hmul2(scale2, __half2half2(y1));
    y1y16[1] = __hmul2(scale2, __half2half2(y16));
}

__forceinline__ __device__ void dequant_4bit_8_prep_zero
(
    const uint32_t zero,
    half2(&z1z16)[2],
    half2(&y1y16)[2]
)
{
    half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero);
    half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));

    z1z16[0] = __half2half2(z1.as_half);
    z1z16[1] = __half2half2(z16);

    const half y1 = __float2half_rn(1.0f);
    const half y16 = __float2half_rn(1.0f / 16.0f);

    y1y16[0] = __half2half2(y1);
    y1y16[1] = __half2half2(y16);
}


__forceinline__ __device__ void dequant_4bit_8_gptq
(
    const uint32_t q_0,
    half2 (&dq)[4],
    half2 (&z1z16)[2],
    half2 (&y1y16)[2],
    int stride,
    bool scaled
)
{
    const uint32_t c0 = 0x64006400;

    uint32_t qa = q_0;
    half2_uint32 q0((qa & 0x000f000f) | c0); // half2( q[0]      + 1024, q[1]      + 1024 )
    half2_uint32 q1((qa & 0x00f000f0) | c0); // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 )
    qa >>= 8;
    half2_uint32 q2((qa & 0x000f000f) | c0); // half2( q[4]      + 1024, q[5]      + 1024 )
    half2_uint32 q3((qa & 0x00f000f0) | c0); // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 )

    if (scaled)
    {
        dq[0] = __hfma2(q0.as_half2, y1y16[0], z1z16[0]);  // half2( q[0] * s - z * s, q[1] * s - z * s)
        dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]);  // half2( q[2] * s - z * s, q[3] * s - z * s)
        dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]);
        dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]);
    }
    else
    {
        dq[0] = __hadd2(q0.as_half2,           z1z16[0]);  // half2( q[0] - z, q[1] - z )
        dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]);  // half2( q[2] - z, q[3] - z )
        dq[2] = __hadd2(q2.as_half2,           z1z16[0]);  // half2( q[4] - z, q[5] - z )
        dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]);  // half2( q[6] - z, q[7] - z )
    }
}

#else

__forceinline__ __device__ void shuffle_4bit_8
(
    uint32_t* q,
    int stride
)
{
}

__forceinline__ __device__ void dequant_4bit_8
(
    const uint32_t q_0,
    half2 (&dq)[4],
    int stride
)
{
    half dqh[8];
    for (int i = 0; i < 8; i++) dqh[i] = dq_ns(exb(q_0, i * 4, 0x0f), 8);

    for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
}

__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale
(
    const uint32_t zero,
    const half scale,
    half2 (&z1)[2],
    half2 (&y1)[2]
)
{
    half z = __int2half_rn(-((int)zero));
    z = __hmul(z, scale);
    z1[0] = __half2half2(z);
    y1[0] = __half2half2(scale);
}

__forceinline__ __device__ void dequant_4bit_8_prep_zero
(
    const uint32_t zero,
    half2(&z1)[2],
    half2(&y1)[2]
)
{
    half z = __int2half_rn(-((int)zero));
    z1[0] = __half2half2(z);
}

__forceinline__ __device__ void dequant_4bit_8_gptq
(
    const uint32_t q_0,
    half2 (&dq)[4],
    half2 (&z1)[2],
    half2 (&y1)[2],
    int stride,
    bool scaled
)
{
    half2 dqh2[8];

    uint32_t qa = q_0;
    for (int i = 0; i < 4; i++)
    {
        half d0 = __int2half_rn(qa & 0x0f); qa >>= 4;
        half d1 = __int2half_rn(qa & 0x0f); qa >>= 4;
        dqh2[i] = __halves2half2(d0, d1);
    }

    if (scaled)
    {
        dq[0] = __hfma2(dqh2[0], y1[0], z1[0]);
        dq[1] = __hfma2(dqh2[1], y1[0], z1[0]);
        dq[2] = __hfma2(dqh2[2], y1[0], z1[0]);
        dq[3] = __hfma2(dqh2[3], y1[0], z1[0]);
    }
    else
    {
        dq[0] = __hadd2(dqh2[0], z1[0]);
        dq[1] = __hadd2(dqh2[1], z1[0]);
        dq[2] = __hadd2(dqh2[2], z1[0]);
        dq[3] = __hadd2(dqh2[3], z1[0]);
    }
}

#endif

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh
================================================
#ifndef _qdq_5_cuh
#define _qdq_5_cuh

#include "qdq_util.cuh"
#include "../../config.h"

#if QMODE_5BIT == 1

// Permutation:
//
// v5555533 33311111  u4444422 22200000  (u, v lsb)
// vbbbbb99 99977777  uaaaaa88 88866666
// vhhhhhff fffddddd  ugggggee eeeccccc
// vnnnnnll llljjjjj  ummmmmkk kkkiiiii
// vtttttrr rrrppppp  usssssqq qqqooooo

__forceinline__ __device__ void shuffle_5bit_32
(
    uint32_t* q,
    int stride
)
{
    uint32_t qa = q[0 * stride];
    uint32_t qb = q[1 * stride];
    uint32_t qc = q[2 * stride];
    uint32_t qd = q[3 * stride];
    uint32_t qe = q[4 * stride];

    // qa: 66555554 44443333  32222211 11100000
    // qb: ccccbbbb baaaaa99  99988888 77777666
    // qc: jiiiiihh hhhggggg  fffffeee eedddddc
    // qd: pppooooo nnnnnmmm  mmlllllk kkkkjjjj
    // qe: vvvvvuuu uuttttts  ssssrrrr rqqqqqpp

    uint32_t qf = qe >> 22;
    qe <<= 8;
    qe |= qd >> 24;
    qd <<= 6;
    qd |= qc >> 26;
    qc <<= 4;
    qc |= qb >> 28;
    qb <<= 2;
    qb |= qa >> 30;

    // qa:   555554 44443333  32222211 11100000
    // qb:   bbbbba aaaa9999  98888877 77766666
    // qc:   hhhhhg ggggffff  feeeeedd dddccccc
    // qd:   nnnnnm mmmmllll  lkkkkkjj jjjiiiii
    // qe:   ttttts ssssrrrr  rqqqqqpp pppooooo
    // qf:                          vv vvvuuuuu

    uint32_t za = 0;
    uint32_t zb = 0;
    uint32_t zc = 0;
    uint32_t zd = 0;
    uint32_t ze = 0;

    for (int i = 0; i < 3; i++) { uint32_t t0 = qa & 0x1f; uint32_t t1 = (qa & 0x3e0) >> 5; qa >>= 10; za |= (t0 << (i * 5)); za |= (t1 << (i * 5 + 16)); }
    for (int i = 0; i < 3; i++) { uint32_t t0 = qb & 0x1f; uint32_t t1 = (qb & 0x3e0) >> 5; qb >>= 10; zb |= (t0 << (i * 5)); zb |= (t1 << (i * 5 + 16)); }
    for (int i = 0; i < 3; i++) { uint32_t t0 = qc & 0x1f; uint32_t t1 = (qc & 0x3e0) >> 5; qc >>= 10; zc |= (t0 << (i * 5)); zc |= (t1 << (i * 5 + 16)); }
    for (int i = 0; i < 3; i++) { uint32_t t0 = qd & 0x1f; uint32_t t1 = (qd & 0x3e0) >> 5; qd >>= 10; zd |= (t0 << (i * 5)); zd |= (t1 << (i * 5 + 16)); }
    for (int i = 0; i < 3; i++) { uint32_t t0 = qe & 0x1f; uint32_t t1 = (qe & 0x3e0) >> 5; qe >>= 10; ze |= (t0 << (i * 5)); ze |= (t1 << (i * 5 + 16)); }

    // za:  5555533 33311111   4444422 22200000
    // zb:  bbbbb99 99977777   aaaaa88 88866666
    // zc:  hhhhhff fffddddd   gggggee eeeccccc
    // zd:  nnnnnll llljjjjj   mmmmmkk kkkiiiii
    // ze:  tttttrr rrrppppp   sssssqq qqqooooo
    // qf:                          vv vvvuuuuu

    za |= ((qf & 0x001) >> 0) << 15;
    zb |= ((qf & 0x002) >> 1) << 15;
    zc |= ((qf & 0x004) >> 2) << 15;
    zd |= ((qf & 0x008) >> 3) << 15;
    ze |= ((qf & 0x010) >> 4) << 15;
    za |= ((qf & 0x020) >> 5) << 31;
    zb |= ((qf & 0x040) >> 6) << 31;
    zc |= ((qf & 0x080) >> 7) << 31;
    zd |= ((qf & 0x100) >> 8) << 31;
    ze |= ((qf & 0x200) >> 9) << 31;

    // za: v5555533 33311111  u4444422 22200000  (u, v lsb)
    // zb: vbbbbb99 99977777  uaaaaa88 88866666
    // zc: vhhhhhff fffddddd  ugggggee eeeccccc
    // zd: vnnnnnll llljjjjj  ummmmmkk kkkiiiii
    // ze: vtttttrr rrrppppp  usssssqq qqqooooo

    q[0 * stride] = za;
    q[1 * stride] = zb;
    q[2 * stride] = zc;
    q[3 * stride] = zd;
    q[4 * stride] = ze;
}

__forceinline__ __device__ void dequant_5bit_32
(
    const uint32_t q_0,
    const uint32_t q_1,
    const uint32_t q_2,
    const uint32_t q_3,
    const uint32_t q_4,
    half2 (&dq)[16],
    int stride
)
{
    const uint32_t c0 = 0x64006400;
    const half y32_ = __float2half_rn(1.0f / 32.0f);
    const half2 y32 = __halves2half2(y32_, y32_);
    const half z1_  = __float2half_rn(-1024.0f         - 16.0f);
    const half z32_ = __float2half_rn(-1024.0f / 32.0f - 16.0f);
    const half2 z1  = __halves2half2(z1_,  z1_);
    const half2 z32 = __halves2half2(z32_, z32_);

    uint32_t qa = q_0;
    uint32_t qb = q_1;
    uint32_t qc = q_2;
    uint32_t qd = q_3;
    uint32_t qe = q_4;

    half2_uint32 q0 ((qa & 0x001f001f) | c0); // half2(q[ 0], q[ 1])      + 1024
    half2_uint32 q1 ((qa & 0x03e003e0) | c0); // half2(q[ 2], q[ 3]) * 32 + 1024
    qa >>= 10;
    half2_uint32 q2 ((qa & 0x001f001f) | c0); // half2(q[ 4], q[ 5])      + 1024
    qa >>= 5;
    qa &= 0x00010001;
    half2_uint32 q3 ((qb & 0x001f001f) | c0); // half2(q[ 6], q[ 7])      + 1024
    half2_uint32 q4 ((qb & 0x03e003e0) | c0); // half2(q[ 8], q[ 9]) * 32 + 1024
    qb >>= 10;
    half2_uint32 q5 ((qb & 0x001f001f) | c0); // half2(q[10], q[11])      + 1024
    qb >>= 4;
    qb &= 0x00020002;
    half2_uint32 q6 ((qc & 0x001f001f) | c0); // half2(q[12], q[13])      + 1024
    half2_uint32 q7 ((qc & 0x03e003e0) | c0); // half2(q[14], q[15]) * 32 + 1024
    qc >>= 10;
    half2_uint32 q8 ((qc & 0x001f001f) | c0); // half2(q[16], q[17])      + 1024
    qc >>= 3;
    qc &= 0x00040004;
    half2_uint32 q9 ((qd & 0x001f001f) | c0); // half2(q[18], q[19])      + 1024
    half2_uint32 q10((qd & 0x03e003e0) | c0); // half2(q[20], q[21]) * 32 + 1024
    qd >>= 10;
    half2_uint32 q11((qd & 0x001f001f) | c0); // half2(q[22], q[23])      + 1024
    qd >>= 2;
    qd &= 0x00080008;
    half2_uint32 q12((qe & 0x001f001f) | c0); // half2(q[24], q[25])      + 1024
    half2_uint32 q13((qe & 0x03e003e0) | c0); // half2(q[26], q[27]) * 32 + 1024
    qe >>= 10;
    half2_uint32 q14((qe & 0x001f001f) | c0); // half2(q[28], q[29])      + 1024
    qe >>= 1;
    qe &= 0x00100010;
    half2_uint32 q15((qa | qb | qc | qd | qe) | c0);

    dq[ 0] = __hadd2( q0.as_half2, z1);
    dq[ 1] = __hfma2( q1.as_half2, y32, z32);
    dq[ 2] = __hadd2( q2.as_half2, z1);
    dq[ 3] = __hadd2( q3.as_half2, z1);
    dq[ 4] = __hfma2( q4.as_half2, y32, z32);
    dq[ 5] = __hadd2( q5.as_half2, z1);
    dq[ 6] = __hadd2( q6.as_half2, z1);
    dq[ 7] = __hfma2( q7.as_half2, y32, z32);
    dq[ 8] = __hadd2( q8.as_half2, z1);
    dq[ 9] = __hadd2( q9.as_half2, z1);
    dq[10] = __hfma2(q10.as_half2, y32, z32);
    dq[11] = __hadd2(q11.as_half2, z1);
    dq[12] = __hadd2(q12.as_half2, z1);
    dq[13] = __hfma2(q13.as_half2, y32, z32);
    dq[14] = __hadd2(q14.as_half2, z1);
    dq[15] = __hadd2(q15.as_half2, z1);
}

#else

__forceinline__ __device__ void shuffle_5bit_32
(
    uint32_t* q,
    int stride
)
{
}

__forceinline__ __device__ void dequant_5bit_32
(
    const uint32_t q_0,
    const uint32_t q_1,
    const uint32_t q_2,
    const uint32_t q_3,
    const uint32_t q_4,
    half2 (&dq)[16],
    int stride
)
{
    half dqh[32];
    for (int i = 0; i <  6; i++) dqh[     i] = dq_ns(exb(     q_0, i * 5    , 0x1f), 16);
                                 dqh[ 6    ] = dq_ns(exb(q_1, q_0,        30, 0x1f), 16);
    for (int i = 0; i <  5; i++) dqh[ 7 + i] = dq_ns(exb(     q_1, i * 5 + 3, 0x1f), 16);
                                 dqh[12    ] = dq_ns(exb(q_2, q_1,        28, 0x1f), 16);
    for (int i = 0; i <  6; i++) dqh[13 + i] = dq_ns(exb(     q_2, i * 5 + 1, 0x1f), 16);
                                 dqh[19    ] = dq_ns(exb(q_3, q_2,        31, 0x1f), 16);
    for (int i = 0; i <  5; i++) dqh[20 + i] = dq_ns(exb(     q_3, i * 5 + 4, 0x1f), 16);
                                 dqh[25    ] = dq_ns(exb(q_4, q_3,        29, 0x1f), 16);
    for (int i = 0; i <  6; i++) dqh[26 + i] = dq_ns(exb(     q_4, i * 5 + 2, 0x1f), 16);

    for (int i = 0; i < 16; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
}

#endif

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh
================================================
#ifndef _qdq_6_cuh
#define _qdq_6_cuh

#include "qdq_util.cuh"
#include "../../config.h"

#if QMODE_6BIT == 1

  // Not implemented

#else

__forceinline__ __device__ void shuffle_6bit_16
(
    uint32_t* q,
    int stride
)
{
}

__forceinline__ __device__ void dequant_6bit_16
(
    const uint32_t q_0,
    const uint32_t q_1,
    const uint32_t q_2,
    half2 (&dq)[8],
    int stride
)
{
    half dqh[16];
    for (int i = 0; i < 5; i++) dqh[     i] = dq_ns(exb(     q_0, i * 6    , 0x3f), 32);
                                dqh[ 5    ] = dq_ns(exb(q_1, q_0,        30, 0x3f), 32);
    for (int i = 0; i < 4; i++) dqh[ 6 + i] = dq_ns(exb(     q_1, i * 6 + 4, 0x3f), 32);
                                dqh[10    ] = dq_ns(exb(q_2, q_1,        28, 0x3f), 32);
    for (int i = 0; i < 5; i++) dqh[11 + i] = dq_ns(exb(     q_2, i * 6 + 2, 0x3f), 32);

    for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
}

#endif

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh
================================================
#ifndef _qdq_8_cuh
#define _qdq_8_cuh

#include "qdq_util.cuh"
#include "../../config.h"

#if QMODE_8BIT == 1

  // Not implemented

#else

__forceinline__ __device__ void shuffle_8bit_4
(
    uint32_t* q,
    int stride
)
{
}

__forceinline__ __device__ void dequant_8bit_8
(
    const uint32_t q_0,
    const uint32_t q_1,
    half2 (&dq)[4],
    int stride
)
{
    half dqh[8];
    for (int i = 0; i < 4; i++) dqh[i    ] = dq_ns(exb(q_0, i * 8, 0xff), 128);
    for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), 128);

    for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
}

#endif

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh
================================================
#ifndef _qdq_util_cuh
#define _qdq_util_cuh

union half2_uint32
{
    uint32_t as_uint32;
    half2 as_half2;
    __device__ half2_uint32(uint32_t val) : as_uint32(val) {}
    __device__ half2_uint32(half2 val) : as_half2(val) {}
    __device__ half2_uint32() : as_uint32(0) {}
};

union half_uint16
{
    uint16_t as_uint16;
    half as_half;
    __device__ half_uint16(uint16_t val) : as_uint16(val) {}
    __device__ half_uint16(half val) : as_half(val) {}
    __device__ half_uint16() : as_uint16(0) {}
};

// Max_scale premultiplied by 1/256

__forceinline__ __device__ half dq_scale(const int qs, const half max_scale)
{
    int qs_i = qs + 1;
    half qs_h = __int2half_rn(qs_i * qs_i);
    qs_h = __hmul(qs_h, max_scale);
    return qs_h;
}

__forceinline__ __device__ half dq(const int q, const int qzero, const half scale)
{
    return __hmul(__int2half_rn(q - qzero), scale);
}

__forceinline__ __device__ half dq_ns(const int q, const int qzero)
{
    //return __hsub(__int2half_rn(q), __int2half_rn(qzero));
    return __int2half_rn(q - qzero);
}

__forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask)
{
    return (int)((q >> shift) & mask);
}

__forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask)
{
    return (int)(__funnelshift_rc(q0, q1, shift) & mask);
}

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh
================================================
#ifndef _util_cuh
#define _util_cuh

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>
#include <ATen/cuda/CUDAContext.h>

#define DIVIDE(x, size) (((x) + (size) - 1) / (size))

#define DBGS(__x) printf("%s\n", __x)
#define DBGI(__x) printf("%s: %i\n", #__x, __x)
#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y)
#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z)
#define DBGX(__x) printf("%s: %x\n", #__x, __x)
#define DBGX2(__x, __y) printf("%s, %s: %x, %x\n", #__x, #__y, __x, __y)
#define DBGX3(__x, __y, __z) printf("%s, %s, %s: %x, %x, %x\n", #__x, #__y, #__z, __x, __y, __z)
#define DBGF(__x) printf("%s: %f\n", #__x, __x)
#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y)
#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z)
#define DBGH(__x) printf("%s: %f\n", #__x, __half2float(__x))
#define DBGH2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __half2float(__x), __half2float(__y))
#define DBGH3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z))

#define DBGIH(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __half2float(__y))
#define DBGIH2(__x, __y, __z) printf("%s, %s, %s: %i, %f, %f\n", #__x, #__y, #__z, __x, __half2float(__y), __half2float(__z))

__forceinline__ __device__ half dq_scale_(const int qs, const half max_scale)
{
    half qs_h = __hmul(__int2half_rn(qs + 1), __float2half_rn(1.0f / 16.0f));
    qs_h = __hmul(qs_h, qs_h);
    qs_h = __hmul(qs_h, max_scale);
    return qs_h;
}

__forceinline__ __device__ float clamp(float x, float a, float b)
{
    return fmaxf(a, fminf(b, x));
}

#define cuda_check(ans) { gpu_assert((ans), __FILE__, __LINE__); }
inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

void print_global_mem(const half* ptr, int rows, int columns, int stride);

#endif


================================================
FILE: server/exllamav2_kernels/exllamav2_kernels/ext.cpp
================================================
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#include <cstdio>

#include "config.h"

#include "cuda/q_matrix.cuh"
#include "cuda/q_gemm.cuh"

#include "cpp/util.h"

// Some decluttering macros

#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")


// Quant matrix

uintptr_t make_q_matrix
(
    torch::Tensor q_weight,
    torch::Tensor q_perm,
    torch::Tensor q_invperm,
    torch::Tensor q_scale,
    torch::Tensor q_scale_max,
    torch::Tensor q_groups,
    torch::Tensor q_group_map,
    torch::Tensor gptq_qzeros,
    torch::Tensor gptq_scales,
    torch::Tensor gptq_g_idx,
    torch::Tensor temp_dq
)
{
    TORCH_CHECK_DTYPE(q_weight, kInt);
    TORCH_CHECK_DTYPE_OPT(q_perm, kShort);
    TORCH_CHECK_DTYPE_OPT(q_invperm, kShort);
    TORCH_CHECK_DTYPE_OPT(q_scale, kInt);
    TORCH_CHECK_DTYPE_OPT(q_scale_max, kHalf);
    TORCH_CHECK_DTYPE_OPT(q_groups, kShort);
    TORCH_CHECK_DTYPE_OPT(q_group_map, kShort);
    TORCH_CHECK_DTYPE_OPT(gptq_qzeros, kInt);
    TORCH_CHECK_DTYPE_OPT(gptq_scales, kHalf);
    TORCH_CHECK_DTYPE_OPT(gptq_g_idx, kInt);

    TORCH_CHECK_SHAPES(q_perm, 0, q_invperm, 0, 1);

    int device = q_weight.device().index();
    int width = q_weight.size(1);
    int groups;
    int height;

    if (!q_scale.device().is_meta())
    {
        TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
        TORCH_CHECK_SHAPES(q_scale_max, 0, q_scale, 0, 1);
        groups = q_scale.size(0);
        height = q_invperm.size(0);
    }
    else
    {
        TORCH_CHECK_SHAPES(q_weight, 1, gptq_qzeros, 1, 8);
        TORCH_CHECK_SHAPES(q_weight, 1, gptq_scales, 1, 1);
        groups = gptq_qzeros.size(0);
        height = q_weight.size(0) * 8;
    }

    TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")

    QMatrix* m = new QMatrix
    (
        device,
        height,
        width,
        groups,
        (uint32_t*) q_weight.data_ptr(),
        q_perm.device().is_meta() ? NULL : (uint16_t*) q_perm.data_ptr(),
        q_invperm.device().is_meta() ? NULL : (uint16_t*) q_invperm.data_ptr(),
        q_scale.device().is_meta() ? NULL : (uint32_t*) q_scale.data_ptr(),
        q_scale_max.device().is_meta() ? NULL : (half*) q_scale_max.data_ptr(),
        q_groups.device().is_meta() ? NULL : (uint16_t*) q_groups.data_ptr(),
        q_group_map.device().is_meta() ? NULL : (uint16_t*) q_group_map.data_ptr(),
        gptq_qzeros.device().is_meta() ? NULL : (uint32_t*) gptq_qzeros.data_ptr(),
        gptq_scales.device().is_meta() ? NULL : (half*) gptq_scales.data_ptr(),
        gptq_g_idx.device().is_meta() ? NULL : (uint32_t*) gptq_g_idx.data_ptr(),
        (half*) temp_dq.data_ptr()
    );

    if (m->failed) throw std::runtime_error("CUDA out of memory");

    return reinterpret_cast<uintptr_t> (m);
}

void gemm_half_q_half
(
    torch::Tensor a,
    uintptr_t b,
    torch::Tensor c,
    bool force_cuda
)
{
    QMatrix* qm = reinterpret_cast<QMatrix*> (b);

    TORCH_CHECK_DTYPE(a, kHalf);
    TORCH_CHECK_DTYPE(c, kHalf);
    TORCH_CHECK_SHAPES(a, 0, c, 0, 1);
    TORCH_CHECK(qm->height == a.size(1), "a and b have incompatible shapes")
    TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes")

    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));

    gemm_half_q_half_cuda
    (
        at::cuda::getCurrentCUDABlasHandle(),
        (const half*) a.data_ptr(),
        qm,
        (half*) c.data_ptr(),
        c.size(0), // m
        c.size(1), // n
        a.size(1), // k
        true,
        NULL,
        force_cuda
    );
}

// Bindings

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("make_q_matrix", &make_q_matrix, "make_q_matrix");
    m.def("gemm_half_q_half", &gemm_half_q_half, "gemm_half_q_half");
}


================================================
FILE: server/exllamav2_kernels/setup.py
================================================
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
import torch

extra_cuda_cflags = ["-lineinfo", "-O3"]
extra_cflags = []
if torch.version.hip:
    extra_cflags = ["-DLEGACY_HIPBLAS_DIRECT=ON"]
    extra_cuda_cflags += ["-DHIPBLAS_USE_HIP_HALF", "-DLEGACY_HIPBLAS_DIRECT=ON"]

extra_compile_args = {
    "cxx": extra_cflags,
    "nvcc": extra_cuda_cflags,
}

setup(
    name="exllamav2_kernels",
    ext_modules=[
        CUDAExtension(
            name="exllamav2_kernels",
            sources=[
                "exllamav2_kernels/ext.cpp",
                "exllamav2_kernels/cuda/q_matrix.cu",
                "exllamav2_kernels/cuda/q_gemm.cu",
            ],
            extra_compile_args=extra_compile_args,
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)


================================================
FILE: server/pyproject.toml
================================================
[project]
name = "text-generation-server"
version = "2.0.5-dev0"
description = "Text Generation Inference Python gRPC Server"
readme = "README.md"
requires-python = ">=3.9"
authors = [
  {name = "Olivier Dehaene", email = "olivier@huggingface.co"},
  {name = "Nicolas Patry", email = "nicolas@huggingface.co"},
]
dependencies = [
    # Remove explicit click dependency once typer/click are compatible again.
    "click<8.2.0",
    "einops>=0.8.0",
    "grpc-interceptor>=0.15.4",
    "grpcio>=1.67.0",
    "grpcio-reflection>=1.67.0",
    "grpcio-status>=1.67.0",
    "kernels>=0.2.1",
    "hf-transfer>=0.1.8",
    "loguru>=0.7.3",
    "numpy>=1.26,<3",
    "opentelemetry-api>=1.27.0",
    "opentelemetry-exporter-otlp>=1.27.0",
    "opentelemetry-instrumentation-grpc>=0.50b0",
    "pillow>=11.1.0",
    "prometheus-client>=0.21.0",
    "protobuf>=5.28.3",
    "py-cpuinfo>=9.0.0",
    "rich>=13.8.1",
    "safetensors>=0.4.5",
    "scipy>=1.13.1",
    "sentencepiece>=0.2.0",
    "tokenizers>=0.20.3",
    "typer>=0.15.1",
    "transformers>=4.51.0",
    "huggingface-hub>=0.30.1",
    "hf-xet>=1.0.0",
]

[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
explicit = true

[tool.uv.sources]
torch = [
  { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
torchvision = [
  { index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]

[build-system]
requires = ["kernels>=0.1.7", "setuptools"]
build-backend = "setuptools.build_meta"

[tool.kernels.dependencies]
"kernels-community/paged-attention" = ">=0.0.2"
"kernels-community/moe" = ">=0.1.1"
"kernels-community/punica-sgmv" = ">=0.0.1"
"kernels-community/quantization" = ">=0.0.3"
"kernels-community/quantization-eetq" = ">=0.0.1"
"kernels-community/rotary" = ">=0.0.1"

[project.scripts]
text-generation-server = "text_generation_server.cli:app"

[project.optional-dependencies]
accelerate = [
    "accelerate>=1.2.1,<2",
]
bnb = [
    "bitsandbytes>=0.45.0",
]
compressed-tensors = [
    "compressed-tensors>=0.9.0",
]
peft = [
    "peft>=0.14.0",
]
outlines = [
    "outlines>=0.1.13,<1.0",
]
dev = [
    "grpcio-tools>=1.51.1,<2.0",
    "pytest>=7.3.0,<8"
]
quantize = [
    "texttable>=1.6.7,<2",
    "datasets>=2.21,<3",
]
gen = [
    "grpcio-tools>=1.69.0",
    "mypy-protobuf>=3.6.0",
]
torch = [
    "torch==2.7.0",
    "torchvision==0.22.0",
]

[tool.pytest.ini_options]
markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]

[tool.isort]
profile = "black"

[tool.uv]
package = true

[tool.setuptools.packages.find]
include = ["text_generation_server*"]


================================================
FILE: server/req.txt
================================================
# This file was autogenerated by uv via the following command:
#    uv pip compile pyproject.toml --extra attention --extra bnb -o req.txt
attention-kernels @ https://github.com/danieldk/attention-kernels/releases/download/v0.2.0.post2/attention_kernels-0.2.0.post2+cu123torch2.5-cp39-abi3-linux_x86_64.whl
    # via text-generation-server (pyproject.toml)
bitsandbytes==0.45.1
    # via text-generation-server (pyproject.toml)
certifi==2025.1.31
    # via requests
charset-normalizer==3.4.1
    # via requests
click==8.1.8
    # via typer
deprecated==1.2.18
    # via
    #   opentelemetry-api
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-semantic-conventions
einops==0.8.0
    # via text-generation-server (pyproject.toml)
filelock==3.17.0
    # via
    #   huggingface-hub
    #   torch
fsspec==2025.2.0
    # via
    #   huggingface-hub
    #   torch
googleapis-common-protos==1.66.0
    # via
    #   grpcio-status
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
grpc-interceptor==0.15.4
    # via text-generation-server (pyproject.toml)
grpcio==1.70.0
    # via
    #   text-generation-server (pyproject.toml)
    #   grpc-interceptor
    #   grpcio-reflection
    #   grpcio-status
    #   opentelemetry-exporter-otlp-proto-grpc
grpcio-reflection==1.70.0
    # via text-generation-server (pyproject.toml)
grpcio-status==1.70.0
    # via text-generation-server (pyproject.toml)
hf-transfer==0.1.9
    # via text-generation-server (pyproject.toml)
huggingface-hub==0.28.1
    # via tokenizers
idna==3.10
    # via requests
importlib-metadata==8.5.0
    # via opentelemetry-api
jinja2==3.1.5
    # via torch
loguru==0.7.3
    # via text-generation-server (pyproject.toml)
markdown-it-py==3.0.0
    # via rich
markupsafe==3.0.2
    # via jinja2
mdurl==0.1.2
    # via markdown-it-py
mpmath==1.3.0
    # via sympy
networkx==3.4.2
    # via torch
numpy==2.2.2
    # via
    #   text-generation-server (pyproject.toml)
    #   bitsandbytes
    #   scipy
nvidia-cublas-cu12==12.4.5.8
    # via
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cuda-cupti-cu12==12.4.127
    # via torch
nvidia-cuda-nvrtc-cu12==12.4.127
    # via torch
nvidia-cuda-runtime-cu12==12.4.127
    # via torch
nvidia-cudnn-cu12==9.1.0.70
    # via torch
nvidia-cufft-cu12==11.2.1.3
    # via torch
nvidia-curand-cu12==10.3.5.147
    # via torch
nvidia-cusolver-cu12==11.6.1.9
    # via torch
nvidia-cusparse-cu12==12.3.1.170
    # via
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cusparselt-cu12==0.6.2
    # via torch
nvidia-nccl-cu12==2.21.5
    # via torch
nvidia-nvjitlink-cu12==12.4.127
    # via
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
nvidia-nvtx-cu12==12.4.127
    # via torch
opentelemetry-api==1.30.0
    # via
    #   text-generation-server (pyproject.toml)
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
    #   opentelemetry-semantic-conventions
opentelemetry-exporter-otlp==1.30.0
    # via text-generation-server (pyproject.toml)
opentelemetry-exporter-otlp-proto-common==1.30.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-exporter-otlp-proto-grpc==1.30.0
    # via opentelemetry-exporter-otlp
opentelemetry-exporter-otlp-proto-http==1.30.0
    # via opentelemetry-exporter-otlp
opentelemetry-instrumentation==0.51b0
    # via opentelemetry-instrumentation-grpc
opentelemetry-instrumentation-grpc==0.51b0
    # via text-generation-server (pyproject.toml)
opentelemetry-proto==1.30.0
    # via
    #   opentelemetry-exporter-otlp-proto-common
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-sdk==1.30.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-semantic-conventions==0.51b0
    # via
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
packaging==24.2
    # via
    #   huggingface-hub
    #   opentelemetry-instrumentation
pillow==11.1.0
    # via text-generation-server (pyproject.toml)
prometheus-client==0.21.1
    # via text-generation-server (pyproject.toml)
protobuf==5.29.3
    # via
    #   text-generation-server (pyproject.toml)
    #   googleapis-common-protos
    #   grpcio-reflection
    #   grpcio-status
    #   opentelemetry-proto
py-cpuinfo==9.0.0
    # via text-generation-server (pyproject.toml)
pygments==2.19.1
    # via rich
pyyaml==6.0.2
    # via huggingface-hub
requests==2.32.3
    # via
    #   huggingface-hub
    #   opentelemetry-exporter-otlp-proto-http
rich==13.9.4
    # via
    #   text-generation-server (pyproject.toml)
    #   typer
safetensors==0.5.2
    # via text-generation-server (pyproject.toml)
scipy==1.15.1
    # via text-generation-server (pyproject.toml)
sentencepiece==0.2.0
    # via text-generation-server (pyproject.toml)
setuptools==75.8.0
    # via torch
shellingham==1.5.4
    # via typer
sympy==1.13.1
    # via torch
tokenizers==0.21.0
    # via text-generation-server (pyproject.toml)
torch==2.6.0
    # via
    #   attention-kernels
    #   bitsandbytes
tqdm==4.67.1
    # via huggingface-hub
triton==3.2.0
    # via torch
typer==0.15.1
    # via text-generation-server (pyproject.toml)
typing-extensions==4.12.2
    # via
    #   huggingface-hub
    #   opentelemetry-sdk
    #   torch
    #   typer
urllib3==2.3.0
    # via requests
wrapt==1.17.2
    # via
    #   deprecated
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
zipp==3.21.0
    # via importlib-metadata


================================================
FILE: server/requirements_cuda.txt
================================================
# This file was autogenerated by uv via the following command:
#    uv pip compile pyproject.toml --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_cuda.txt --python-version 3.11
accelerate==1.6.0
    # via
    #   text-generation-server (pyproject.toml)
    #   peft
aiohappyeyeballs==2.6.1
    # via aiohttp
aiohttp==3.11.18
    # via
    #   datasets
    #   fsspec
aiosignal==1.3.2
    # via aiohttp
airportsdata==20250224
    # via outlines
annotated-types==0.7.0
    # via pydantic
attrs==25.3.0
    # via
    #   aiohttp
    #   jsonschema
    #   referencing
bitsandbytes==0.45.5
    # via text-generation-server (pyproject.toml)
certifi==2025.4.26
    # via requests
charset-normalizer==3.4.2
    # via requests
click==8.1.8
    # via
    #   text-generation-server (pyproject.toml)
    #   typer
cloudpickle==3.1.1
    # via outlines
compressed-tensors==0.9.4
    # via text-generation-server (pyproject.toml)
datasets==2.21.0
    # via text-generation-server (pyproject.toml)
deprecated==1.2.18
    # via
    #   opentelemetry-api
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-semantic-conventions
dill==0.3.8
    # via
    #   datasets
    #   multiprocess
diskcache==5.6.3
    # via outlines
einops==0.8.1
    # via text-generation-server (pyproject.toml)
filelock==3.18.0
    # via
    #   datasets
    #   huggingface-hub
    #   torch
    #   transformers
frozenlist==1.6.0
    # via
    #   aiohttp
    #   aiosignal
fsspec==2024.6.1
    # via
    #   datasets
    #   huggingface-hub
    #   torch
genson==1.3.0
    # via outlines
googleapis-common-protos==1.70.0
    # via
    #   grpcio-status
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
grpc-interceptor==0.15.4
    # via text-generation-server (pyproject.toml)
grpcio==1.71.0
    # via
    #   text-generation-server (pyproject.toml)
    #   grpc-interceptor
    #   grpcio-reflection
    #   grpcio-status
    #   opentelemetry-exporter-otlp-proto-grpc
grpcio-reflection==1.71.0
    # via text-generation-server (pyproject.toml)
grpcio-status==1.71.0
    # via text-generation-server (pyproject.toml)
hf-transfer==0.1.9
    # via text-generation-server (pyproject.toml)
hf-xet==1.1.0
    # via
    #   text-generation-server (pyproject.toml)
    #   huggingface-hub
huggingface-hub==0.31.1
    # via
    #   text-generation-server (pyproject.toml)
    #   accelerate
    #   datasets
    #   kernels
    #   peft
    #   tokenizers
    #   transformers
idna==3.10
    # via
    #   requests
    #   yarl
importlib-metadata==8.6.1
    # via opentelemetry-api
interegular==0.3.3
    # via
    #   outlines
    #   outlines-core
iso3166==2.1.1
    # via outlines
jinja2==3.1.6
    # via
    #   outlines
    #   torch
jsonschema==4.23.0
    # via
    #   outlines
    #   outlines-core
jsonschema-specifications==2025.4.1
    # via jsonschema
kernels==0.5.0
    # via text-generation-server (pyproject.toml)
lark==1.2.2
    # via outlines
loguru==0.7.3
    # via text-generation-server (pyproject.toml)
markdown-it-py==3.0.0
    # via rich
markupsafe==3.0.2
    # via jinja2
mdurl==0.1.2
    # via markdown-it-py
mpmath==1.3.0
    # via sympy
multidict==6.4.3
    # via
    #   aiohttp
    #   yarl
multiprocess==0.70.16
    # via datasets
nest-asyncio==1.6.0
    # via outlines
networkx==3.4.2
    # via torch
numpy==2.2.5
    # via
    #   text-generation-server (pyproject.toml)
    #   accelerate
    #   bitsandbytes
    #   datasets
    #   outlines
    #   pandas
    #   peft
    #   scipy
    #   transformers
nvidia-cublas-cu12==12.6.4.1
    # via
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cuda-cupti-cu12==12.6.80
    # via torch
nvidia-cuda-nvrtc-cu12==12.6.77
    # via torch
nvidia-cuda-runtime-cu12==12.6.77
    # via torch
nvidia-cudnn-cu12==9.5.1.17
    # via torch
nvidia-cufft-cu12==11.3.0.4
    # via torch
nvidia-cufile-cu12==1.11.1.6
    # via torch
nvidia-curand-cu12==10.3.7.77
    # via torch
nvidia-cusolver-cu12==11.7.1.2
    # via torch
nvidia-cusparse-cu12==12.5.4.2
    # via
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cusparselt-cu12==0.6.3
    # via torch
nvidia-nccl-cu12==2.26.2
    # via torch
nvidia-nvjitlink-cu12==12.6.85
    # via
    #   nvidia-cufft-cu12
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
nvidia-nvtx-cu12==12.6.77
    # via torch
opentelemetry-api==1.33.0
    # via
    #   text-generation-server (pyproject.toml)
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
    #   opentelemetry-semantic-conventions
opentelemetry-exporter-otlp==1.33.0
    # via text-generation-server (pyproject.toml)
opentelemetry-exporter-otlp-proto-common==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-exporter-otlp-proto-grpc==1.33.0
    # via opentelemetry-exporter-otlp
opentelemetry-exporter-otlp-proto-http==1.33.0
    # via opentelemetry-exporter-otlp
opentelemetry-instrumentation==0.54b0
    # via opentelemetry-instrumentation-grpc
opentelemetry-instrumentation-grpc==0.54b0
    # via text-generation-server (pyproject.toml)
opentelemetry-proto==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-common
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-sdk==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-semantic-conventions==0.54b0
    # via
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
outlines==0.2.3
    # via text-generation-server (pyproject.toml)
outlines-core==0.1.26
    # via outlines
packaging==25.0
    # via
    #   accelerate
    #   datasets
    #   huggingface-hub
    #   kernels
    #   opentelemetry-instrumentation
    #   peft
    #   transformers
pandas==2.2.3
    # via datasets
peft==0.15.2
    # via text-generation-server (pyproject.toml)
pillow==11.2.1
    # via text-generation-server (pyproject.toml)
prometheus-client==0.21.1
    # via text-generation-server (pyproject.toml)
propcache==0.3.1
    # via
    #   aiohttp
    #   yarl
protobuf==5.29.4
    # via
    #   text-generation-server (pyproject.toml)
    #   googleapis-common-protos
    #   grpcio-reflection
    #   grpcio-status
    #   opentelemetry-proto
psutil==7.0.0
    # via
    #   accelerate
    #   peft
py-cpuinfo==9.0.0
    # via text-generation-server (pyproject.toml)
pyarrow==20.0.0
    # via datasets
pydantic==2.11.4
    # via
    #   compressed-tensors
    #   outlines
pydantic-core==2.33.2
    # via pydantic
pygments==2.19.1
    # via rich
python-dateutil==2.9.0.post0
    # via pandas
pytz==2025.2
    # via pandas
pyyaml==6.0.2
    # via
    #   accelerate
    #   datasets
    #   huggingface-hub
    #   peft
    #   transformers
referencing==0.36.2
    # via
    #   jsonschema
    #   jsonschema-specifications
    #   outlines
regex==2024.11.6
    # via transformers
requests==2.32.3
    # via
    #   datasets
    #   huggingface-hub
    #   opentelemetry-exporter-otlp-proto-http
    #   outlines
    #   transformers
rich==14.0.0
    # via
    #   text-generation-server (pyproject.toml)
    #   typer
rpds-py==0.24.0
    # via
    #   jsonschema
    #   referencing
safetensors==0.5.3
    # via
    #   text-generation-server (pyproject.toml)
    #   accelerate
    #   peft
    #   transformers
scipy==1.15.3
    # via text-generation-server (pyproject.toml)
sentencepiece==0.2.0
    # via text-generation-server (pyproject.toml)
setuptools==80.4.0
    # via triton
shellingham==1.5.4
    # via typer
six==1.17.0
    # via python-dateutil
sympy==1.14.0
    # via torch
texttable==1.7.0
    # via text-generation-server (pyproject.toml)
tokenizers==0.21.1
    # via
    #   text-generation-server (pyproject.toml)
    #   transformers
torch==2.7.0
    # via
    #   accelerate
    #   bitsandbytes
    #   compressed-tensors
    #   outlines
    #   peft
tqdm==4.67.1
    # via
    #   datasets
    #   huggingface-hub
    #   outlines
    #   peft
    #   transformers
transformers==4.51.3
    # via
    #   text-generation-server (pyproject.toml)
    #   compressed-tensors
    #   peft
triton==3.3.0
    # via torch
typer==0.15.3
    # via text-generation-server (pyproject.toml)
typing-extensions==4.13.2
    # via
    #   huggingface-hub
    #   opentelemetry-sdk
    #   outlines
    #   pydantic
    #   pydantic-core
    #   referencing
    #   torch
    #   typer
    #   typing-inspection
typing-inspection==0.4.0
    # via pydantic
tzdata==2025.2
    # via pandas
urllib3==2.4.0
    # via requests
wrapt==1.17.2
    # via
    #   deprecated
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
xxhash==3.5.0
    # via datasets
yarl==1.20.0
    # via aiohttp
zipp==3.21.0
    # via importlib-metadata


================================================
FILE: server/requirements_gen.txt
================================================
# This file was autogenerated by uv via the following command:
#    uv pip compile pyproject.toml --extra gen -o requirements_gen.txt --python-version 3.11
certifi==2025.4.26
    # via requests
charset-normalizer==3.4.2
    # via requests
click==8.1.8
    # via
    #   text-generation-server (pyproject.toml)
    #   typer
deprecated==1.2.18
    # via
    #   opentelemetry-api
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-semantic-conventions
einops==0.8.1
    # via text-generation-server (pyproject.toml)
filelock==3.18.0
    # via
    #   huggingface-hub
    #   transformers
fsspec==2025.3.2
    # via huggingface-hub
googleapis-common-protos==1.70.0
    # via
    #   grpcio-status
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
grpc-interceptor==0.15.4
    # via text-generation-server (pyproject.toml)
grpcio==1.71.0
    # via
    #   text-generation-server (pyproject.toml)
    #   grpc-interceptor
    #   grpcio-reflection
    #   grpcio-status
    #   grpcio-tools
    #   opentelemetry-exporter-otlp-proto-grpc
grpcio-reflection==1.71.0
    # via text-generation-server (pyproject.toml)
grpcio-status==1.71.0
    # via text-generation-server (pyproject.toml)
grpcio-tools==1.71.0
    # via text-generation-server (pyproject.toml)
hf-transfer==0.1.9
    # via text-generation-server (pyproject.toml)
hf-xet==1.1.0
    # via
    #   text-generation-server (pyproject.toml)
    #   huggingface-hub
huggingface-hub==0.31.1
    # via
    #   text-generation-server (pyproject.toml)
    #   kernels
    #   tokenizers
    #   transformers
idna==3.10
    # via requests
importlib-metadata==8.6.1
    # via opentelemetry-api
kernels==0.5.0
    # via text-generation-server (pyproject.toml)
loguru==0.7.3
    # via text-generation-server (pyproject.toml)
markdown-it-py==3.0.0
    # via rich
mdurl==0.1.2
    # via markdown-it-py
mypy-protobuf==3.6.0
    # via text-generation-server (pyproject.toml)
numpy==2.2.5
    # via
    #   text-generation-server (pyproject.toml)
    #   scipy
    #   transformers
opentelemetry-api==1.33.0
    # via
    #   text-generation-server (pyproject.toml)
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
    #   opentelemetry-semantic-conventions
opentelemetry-exporter-otlp==1.33.0
    # via text-generation-server (pyproject.toml)
opentelemetry-exporter-otlp-proto-common==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-exporter-otlp-proto-grpc==1.33.0
    # via opentelemetry-exporter-otlp
opentelemetry-exporter-otlp-proto-http==1.33.0
    # via opentelemetry-exporter-otlp
opentelemetry-instrumentation==0.54b0
    # via opentelemetry-instrumentation-grpc
opentelemetry-instrumentation-grpc==0.54b0
    # via text-generation-server (pyproject.toml)
opentelemetry-proto==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-common
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-sdk==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-semantic-conventions==0.54b0
    # via
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
packaging==25.0
    # via
    #   huggingface-hub
    #   kernels
    #   opentelemetry-instrumentation
    #   transformers
pillow==11.2.1
    # via text-generation-server (pyproject.toml)
prometheus-client==0.21.1
    # via text-generation-server (pyproject.toml)
protobuf==5.29.4
    # via
    #   text-generation-server (pyproject.toml)
    #   googleapis-common-protos
    #   grpcio-reflection
    #   grpcio-status
    #   grpcio-tools
    #   mypy-protobuf
    #   opentelemetry-proto
py-cpuinfo==9.0.0
    # via text-generation-server (pyproject.toml)
pygments==2.19.1
    # via rich
pyyaml==6.0.2
    # via
    #   huggingface-hub
    #   transformers
regex==2024.11.6
    # via transformers
requests==2.32.3
    # via
    #   huggingface-hub
    #   opentelemetry-exporter-otlp-proto-http
    #   transformers
rich==14.0.0
    # via
    #   text-generation-server (pyproject.toml)
    #   typer
safetensors==0.5.3
    # via
    #   text-generation-server (pyproject.toml)
    #   transformers
scipy==1.15.3
    # via text-generation-server (pyproject.toml)
sentencepiece==0.2.0
    # via text-generation-server (pyproject.toml)
setuptools==80.4.0
    # via grpcio-tools
shellingham==1.5.4
    # via typer
tokenizers==0.21.1
    # via
    #   text-generation-server (pyproject.toml)
    #   transformers
tqdm==4.67.1
    # via
    #   huggingface-hub
    #   transformers
transformers==4.51.3
    # via text-generation-server (pyproject.toml)
typer==0.15.3
    # via text-generation-server (pyproject.toml)
types-protobuf==6.30.2.20250506
    # via mypy-protobuf
typing-extensions==4.13.2
    # via
    #   huggingface-hub
    #   opentelemetry-sdk
    #   typer
urllib3==2.4.0
    # via requests
wrapt==1.17.2
    # via
    #   deprecated
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
zipp==3.21.0
    # via importlib-metadata


================================================
FILE: server/requirements_intel.txt
================================================
# This file was autogenerated by uv via the following command:
#    uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_intel.txt --python-version 3.11
accelerate==1.6.0
    # via
    #   text-generation-server (pyproject.toml)
    #   peft
aiohappyeyeballs==2.6.1
    # via aiohttp
aiohttp==3.11.18
    # via
    #   datasets
    #   fsspec
aiosignal==1.3.2
    # via aiohttp
airportsdata==20250224
    # via outlines
annotated-types==0.7.0
    # via pydantic
attrs==25.3.0
    # via
    #   aiohttp
    #   jsonschema
    #   referencing
certifi==2025.4.26
    # via requests
charset-normalizer==3.4.2
    # via requests
click==8.1.8
    # via
    #   text-generation-server (pyproject.toml)
    #   typer
cloudpickle==3.1.1
    # via outlines
compressed-tensors==0.9.4
    # via text-generation-server (pyproject.toml)
datasets==2.21.0
    # via text-generation-server (pyproject.toml)
deprecated==1.2.18
    # via
    #   opentelemetry-api
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-semantic-conventions
dill==0.3.8
    # via
    #   datasets
    #   multiprocess
diskcache==5.6.3
    # via outlines
einops==0.8.1
    # via text-generation-server (pyproject.toml)
filelock==3.18.0
    # via
    #   datasets
    #   huggingface-hub
    #   torch
    #   transformers
frozenlist==1.6.0
    # via
    #   aiohttp
    #   aiosignal
fsspec==2024.6.1
    # via
    #   datasets
    #   huggingface-hub
    #   torch
genson==1.3.0
    # via outlines
googleapis-common-protos==1.70.0
    # via
    #   grpcio-status
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
grpc-interceptor==0.15.4
    # via text-generation-server (pyproject.toml)
grpcio==1.71.0
    # via
    #   text-generation-server (pyproject.toml)
    #   grpc-interceptor
    #   grpcio-reflection
    #   grpcio-status
    #   opentelemetry-exporter-otlp-proto-grpc
grpcio-reflection==1.71.0
    # via text-generation-server (pyproject.toml)
grpcio-status==1.71.0
    # via text-generation-server (pyproject.toml)
hf-transfer==0.1.9
    # via text-generation-server (pyproject.toml)
hf-xet==1.1.0
    # via
    #   text-generation-server (pyproject.toml)
    #   huggingface-hub
huggingface-hub==0.31.1
    # via
    #   text-generation-server (pyproject.toml)
    #   accelerate
    #   datasets
    #   kernels
    #   peft
    #   tokenizers
    #   transformers
idna==3.10
    # via
    #   requests
    #   yarl
importlib-metadata==8.6.1
    # via opentelemetry-api
interegular==0.3.3
    # via
    #   outlines
    #   outlines-core
iso3166==2.1.1
    # via outlines
jinja2==3.1.6
    # via
    #   outlines
    #   torch
jsonschema==4.23.0
    # via
    #   outlines
    #   outlines-core
jsonschema-specifications==2025.4.1
    # via jsonschema
kernels==0.5.0
    # via text-generation-server (pyproject.toml)
lark==1.2.2
    # via outlines
loguru==0.7.3
    # via text-generation-server (pyproject.toml)
markdown-it-py==3.0.0
    # via rich
markupsafe==3.0.2
    # via jinja2
mdurl==0.1.2
    # via markdown-it-py
mpmath==1.3.0
    # via sympy
multidict==6.4.3
    # via
    #   aiohttp
    #   yarl
multiprocess==0.70.16
    # via datasets
nest-asyncio==1.6.0
    # via outlines
networkx==3.4.2
    # via torch
numpy==2.2.5
    # via
    #   text-generation-server (pyproject.toml)
    #   accelerate
    #   datasets
    #   outlines
    #   pandas
    #   peft
    #   scipy
    #   transformers
nvidia-cublas-cu12==12.6.4.1
    # via
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cuda-cupti-cu12==12.6.80
    # via torch
nvidia-cuda-nvrtc-cu12==12.6.77
    # via torch
nvidia-cuda-runtime-cu12==12.6.77
    # via torch
nvidia-cudnn-cu12==9.5.1.17
    # via torch
nvidia-cufft-cu12==11.3.0.4
    # via torch
nvidia-cufile-cu12==1.11.1.6
    # via torch
nvidia-curand-cu12==10.3.7.77
    # via torch
nvidia-cusolver-cu12==11.7.1.2
    # via torch
nvidia-cusparse-cu12==12.5.4.2
    # via
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cusparselt-cu12==0.6.3
    # via torch
nvidia-nccl-cu12==2.26.2
    # via torch
nvidia-nvjitlink-cu12==12.6.85
    # via
    #   nvidia-cufft-cu12
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
nvidia-nvtx-cu12==12.6.77
    # via torch
opentelemetry-api==1.33.0
    # via
    #   text-generation-server (pyproject.toml)
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
    #   opentelemetry-semantic-conventions
opentelemetry-exporter-otlp==1.33.0
    # via text-generation-server (pyproject.toml)
opentelemetry-exporter-otlp-proto-common==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-exporter-otlp-proto-grpc==1.33.0
    # via opentelemetry-exporter-otlp
opentelemetry-exporter-otlp-proto-http==1.33.0
    # via opentelemetry-exporter-otlp
opentelemetry-instrumentation==0.54b0
    # via opentelemetry-instrumentation-grpc
opentelemetry-instrumentation-grpc==0.54b0
    # via text-generation-server (pyproject.toml)
opentelemetry-proto==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-common
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-sdk==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-semantic-conventions==0.54b0
    # via
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
outlines==0.2.3
    # via text-generation-server (pyproject.toml)
outlines-core==0.1.26
    # via outlines
packaging==25.0
    # via
    #   accelerate
    #   datasets
    #   huggingface-hub
    #   kernels
    #   opentelemetry-instrumentation
    #   peft
    #   transformers
pandas==2.2.3
    # via datasets
peft==0.15.2
    # via text-generation-server (pyproject.toml)
pillow==11.2.1
    # via text-generation-server (pyproject.toml)
prometheus-client==0.21.1
    # via text-generation-server (pyproject.toml)
propcache==0.3.1
    # via
    #   aiohttp
    #   yarl
protobuf==5.29.4
    # via
    #   text-generation-server (pyproject.toml)
    #   googleapis-common-protos
    #   grpcio-reflection
    #   grpcio-status
    #   opentelemetry-proto
psutil==7.0.0
    # via
    #   accelerate
    #   peft
py-cpuinfo==9.0.0
    # via text-generation-server (pyproject.toml)
pyarrow==20.0.0
    # via datasets
pydantic==2.11.4
    # via
    #   compressed-tensors
    #   outlines
pydantic-core==2.33.2
    # via pydantic
pygments==2.19.1
    # via rich
python-dateutil==2.9.0.post0
    # via pandas
pytz==2025.2
    # via pandas
pyyaml==6.0.2
    # via
    #   accelerate
    #   datasets
    #   huggingface-hub
    #   peft
    #   transformers
referencing==0.36.2
    # via
    #   jsonschema
    #   jsonschema-specifications
    #   outlines
regex==2024.11.6
    # via transformers
requests==2.32.3
    # via
    #   datasets
    #   huggingface-hub
    #   opentelemetry-exporter-otlp-proto-http
    #   outlines
    #   transformers
rich==14.0.0
    # via
    #   text-generation-server (pyproject.toml)
    #   typer
rpds-py==0.24.0
    # via
    #   jsonschema
    #   referencing
safetensors==0.5.3
    # via
    #   text-generation-server (pyproject.toml)
    #   accelerate
    #   peft
    #   transformers
scipy==1.15.3
    # via text-generation-server (pyproject.toml)
sentencepiece==0.2.0
    # via text-generation-server (pyproject.toml)
setuptools==80.4.0
    # via triton
shellingham==1.5.4
    # via typer
six==1.17.0
    # via python-dateutil
sympy==1.14.0
    # via torch
texttable==1.7.0
    # via text-generation-server (pyproject.toml)
tokenizers==0.21.1
    # via
    #   text-generation-server (pyproject.toml)
    #   transformers
torch==2.7.0
    # via
    #   accelerate
    #   compressed-tensors
    #   outlines
    #   peft
tqdm==4.67.1
    # via
    #   datasets
    #   huggingface-hub
    #   outlines
    #   peft
    #   transformers
transformers==4.51.3
    # via
    #   text-generation-server (pyproject.toml)
    #   compressed-tensors
    #   peft
triton==3.3.0
    # via torch
typer==0.15.3
    # via text-generation-server (pyproject.toml)
typing-extensions==4.13.2
    # via
    #   huggingface-hub
    #   opentelemetry-sdk
    #   outlines
    #   pydantic
    #   pydantic-core
    #   referencing
    #   torch
    #   typer
    #   typing-inspection
typing-inspection==0.4.0
    # via pydantic
tzdata==2025.2
    # via pandas
urllib3==2.4.0
    # via requests
wrapt==1.17.2
    # via
    #   deprecated
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
xxhash==3.5.0
    # via datasets
yarl==1.20.0
    # via aiohttp
zipp==3.21.0
    # via importlib-metadata


================================================
FILE: server/requirements_rocm.txt
================================================
# This file was autogenerated by uv via the following command:
#    uv pip compile pyproject.toml --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines -o requirements_rocm.txt --python-version 3.11
accelerate==1.6.0
    # via
    #   text-generation-server (pyproject.toml)
    #   peft
aiohappyeyeballs==2.6.1
    # via aiohttp
aiohttp==3.11.18
    # via
    #   datasets
    #   fsspec
aiosignal==1.3.2
    # via aiohttp
airportsdata==20250224
    # via outlines
annotated-types==0.7.0
    # via pydantic
attrs==25.3.0
    # via
    #   aiohttp
    #   jsonschema
    #   referencing
certifi==2025.4.26
    # via requests
charset-normalizer==3.4.2
    # via requests
click==8.1.8
    # via
    #   text-generation-server (pyproject.toml)
    #   typer
cloudpickle==3.1.1
    # via outlines
compressed-tensors==0.9.4
    # via text-generation-server (pyproject.toml)
datasets==2.21.0
    # via text-generation-server (pyproject.toml)
deprecated==1.2.18
    # via
    #   opentelemetry-api
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-semantic-conventions
dill==0.3.8
    # via
    #   datasets
    #   multiprocess
diskcache==5.6.3
    # via outlines
einops==0.8.1
    # via text-generation-server (pyproject.toml)
filelock==3.18.0
    # via
    #   datasets
    #   huggingface-hub
    #   torch
    #   transformers
frozenlist==1.6.0
    # via
    #   aiohttp
    #   aiosignal
fsspec==2024.6.1
    # via
    #   datasets
    #   huggingface-hub
    #   torch
genson==1.3.0
    # via outlines
googleapis-common-protos==1.70.0
    # via
    #   grpcio-status
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
grpc-interceptor==0.15.4
    # via text-generation-server (pyproject.toml)
grpcio==1.71.0
    # via
    #   text-generation-server (pyproject.toml)
    #   grpc-interceptor
    #   grpcio-reflection
    #   grpcio-status
    #   opentelemetry-exporter-otlp-proto-grpc
grpcio-reflection==1.71.0
    # via text-generation-server (pyproject.toml)
grpcio-status==1.71.0
    # via text-generation-server (pyproject.toml)
hf-transfer==0.1.9
    # via text-generation-server (pyproject.toml)
hf-xet==1.1.0
    # via
    #   text-generation-server (pyproject.toml)
    #   huggingface-hub
huggingface-hub==0.31.1
    # via
    #   text-generation-server (pyproject.toml)
    #   accelerate
    #   datasets
    #   kernels
    #   peft
    #   tokenizers
    #   transformers
idna==3.10
    # via
    #   requests
    #   yarl
importlib-metadata==8.6.1
    # via opentelemetry-api
interegular==0.3.3
    # via
    #   outlines
    #   outlines-core
iso3166==2.1.1
    # via outlines
jinja2==3.1.6
    # via
    #   outlines
    #   torch
jsonschema==4.23.0
    # via
    #   outlines
    #   outlines-core
jsonschema-specifications==2025.4.1
    # via jsonschema
kernels==0.5.0
    # via text-generation-server (pyproject.toml)
lark==1.2.2
    # via outlines
loguru==0.7.3
    # via text-generation-server (pyproject.toml)
markdown-it-py==3.0.0
    # via rich
markupsafe==3.0.2
    # via jinja2
mdurl==0.1.2
    # via markdown-it-py
mpmath==1.3.0
    # via sympy
multidict==6.4.3
    # via
    #   aiohttp
    #   yarl
multiprocess==0.70.16
    # via datasets
nest-asyncio==1.6.0
    # via outlines
networkx==3.4.2
    # via torch
numpy==2.2.5
    # via
    #   text-generation-server (pyproject.toml)
    #   accelerate
    #   datasets
    #   outlines
    #   pandas
    #   peft
    #   scipy
    #   transformers
nvidia-cublas-cu12==12.6.4.1
    # via
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cuda-cupti-cu12==12.6.80
    # via torch
nvidia-cuda-nvrtc-cu12==12.6.77
    # via torch
nvidia-cuda-runtime-cu12==12.6.77
    # via torch
nvidia-cudnn-cu12==9.5.1.17
    # via torch
nvidia-cufft-cu12==11.3.0.4
    # via torch
nvidia-cufile-cu12==1.11.1.6
    # via torch
nvidia-curand-cu12==10.3.7.77
    # via torch
nvidia-cusolver-cu12==11.7.1.2
    # via torch
nvidia-cusparse-cu12==12.5.4.2
    # via
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cusparselt-cu12==0.6.3
    # via torch
nvidia-nccl-cu12==2.26.2
    # via torch
nvidia-nvjitlink-cu12==12.6.85
    # via
    #   nvidia-cufft-cu12
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
nvidia-nvtx-cu12==12.6.77
    # via torch
opentelemetry-api==1.33.0
    # via
    #   text-generation-server (pyproject.toml)
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
    #   opentelemetry-semantic-conventions
opentelemetry-exporter-otlp==1.33.0
    # via text-generation-server (pyproject.toml)
opentelemetry-exporter-otlp-proto-common==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-exporter-otlp-proto-grpc==1.33.0
    # via opentelemetry-exporter-otlp
opentelemetry-exporter-otlp-proto-http==1.33.0
    # via opentelemetry-exporter-otlp
opentelemetry-instrumentation==0.54b0
    # via opentelemetry-instrumentation-grpc
opentelemetry-instrumentation-grpc==0.54b0
    # via text-generation-server (pyproject.toml)
opentelemetry-proto==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-common
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-sdk==1.33.0
    # via
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-exporter-otlp-proto-http
opentelemetry-semantic-conventions==0.54b0
    # via
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
    #   opentelemetry-sdk
outlines==0.2.3
    # via text-generation-server (pyproject.toml)
outlines-core==0.1.26
    # via outlines
packaging==25.0
    # via
    #   accelerate
    #   datasets
    #   huggingface-hub
    #   kernels
    #   opentelemetry-instrumentation
    #   peft
    #   transformers
pandas==2.2.3
    # via datasets
peft==0.15.2
    # via text-generation-server (pyproject.toml)
pillow==11.2.1
    # via text-generation-server (pyproject.toml)
prometheus-client==0.21.1
    # via text-generation-server (pyproject.toml)
propcache==0.3.1
    # via
    #   aiohttp
    #   yarl
protobuf==5.29.4
    # via
    #   text-generation-server (pyproject.toml)
    #   googleapis-common-protos
    #   grpcio-reflection
    #   grpcio-status
    #   opentelemetry-proto
psutil==7.0.0
    # via
    #   accelerate
    #   peft
py-cpuinfo==9.0.0
    # via text-generation-server (pyproject.toml)
pyarrow==20.0.0
    # via datasets
pydantic==2.11.4
    # via
    #   compressed-tensors
    #   outlines
pydantic-core==2.33.2
    # via pydantic
pygments==2.19.1
    # via rich
python-dateutil==2.9.0.post0
    # via pandas
pytz==2025.2
    # via pandas
pyyaml==6.0.2
    # via
    #   accelerate
    #   datasets
    #   huggingface-hub
    #   peft
    #   transformers
referencing==0.36.2
    # via
    #   jsonschema
    #   jsonschema-specifications
    #   outlines
regex==2024.11.6
    # via transformers
requests==2.32.3
    # via
    #   datasets
    #   huggingface-hub
    #   opentelemetry-exporter-otlp-proto-http
    #   outlines
    #   transformers
rich==14.0.0
    # via
    #   text-generation-server (pyproject.toml)
    #   typer
rpds-py==0.24.0
    # via
    #   jsonschema
    #   referencing
safetensors==0.5.3
    # via
    #   text-generation-server (pyproject.toml)
    #   accelerate
    #   peft
    #   transformers
scipy==1.15.3
    # via text-generation-server (pyproject.toml)
sentencepiece==0.2.0
    # via text-generation-server (pyproject.toml)
setuptools==80.4.0
    # via triton
shellingham==1.5.4
    # via typer
six==1.17.0
    # via python-dateutil
sympy==1.14.0
    # via torch
texttable==1.7.0
    # via text-generation-server (pyproject.toml)
tokenizers==0.21.1
    # via
    #   text-generation-server (pyproject.toml)
    #   transformers
torch==2.7.0
    # via
    #   accelerate
    #   compressed-tensors
    #   outlines
    #   peft
tqdm==4.67.1
    # via
    #   datasets
    #   huggingface-hub
    #   outlines
    #   peft
    #   transformers
transformers==4.51.3
    # via
    #   text-generation-server (pyproject.toml)
    #   compressed-tensors
    #   peft
triton==3.3.0
    # via torch
typer==0.15.3
    # via text-generation-server (pyproject.toml)
typing-extensions==4.13.2
    # via
    #   huggingface-hub
    #   opentelemetry-sdk
    #   outlines
    #   pydantic
    #   pydantic-core
    #   referencing
    #   torch
    #   typer
    #   typing-inspection
typing-inspection==0.4.0
    # via pydantic
tzdata==2025.2
    # via pandas
urllib3==2.4.0
    # via requests
wrapt==1.17.2
    # via
    #   deprecated
    #   opentelemetry-instrumentation
    #   opentelemetry-instrumentation-grpc
xxhash==3.5.0
    # via datasets
yarl==1.20.0
    # via aiohttp
zipp==3.21.0
    # via importlib-metadata


================================================
FILE: server/tests/conftest.py
================================================
import pytest
import os
from text_generation_server.pb import generate_pb2

os.environ["PREFIX_CACHING"] = "1"
os.environ["ATTENTION"] = "flashinfer"


@pytest.fixture
def default_pb_parameters():
    return generate_pb2.NextTokenChooserParameters(
        temperature=1.0,
        repetition_penalty=1.0,
        top_k=0,
        top_p=1.0,
        typical_p=1.0,
        do_sample=False,
    )


@pytest.fixture
def default_pb_stop_parameters():
    return generate_pb2.StoppingCriteriaParameters(stop_sequences=[], max_new_tokens=10)


================================================
FILE: server/tests/models/test_bloom.py
================================================
import pytest
import torch

from copy import copy
from transformers import AutoTokenizer

from text_generation_server.pb import generate_pb2
from text_generation_server.models.causal_lm import CausalLMBatch
from text_generation_server.utils import weight_hub_files, download_weights
from text_generation_server.models.bloom import BloomCausalLMBatch, BLOOMSharded
from text_generation_server.models.custom_modeling.bloom_modeling import (
    BloomForCausalLM,
)


@pytest.fixture(scope="session")
def default_bloom():
    model_id = "bigscience/bloom-560m"
    revision = "main"
    filenames = weight_hub_files(model_id, revision, ".safetensors")
    download_weights(filenames, model_id, revision)
    return BLOOMSharded(
        model_id,
        model_class=BloomForCausalLM,
    )


@pytest.fixture(scope="session")
def bloom_560m_tokenizer():
    return AutoTokenizer.from_pretrained("bigscience/bloom-560m", padding_side="left")


@pytest.fixture
def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
    return generate_pb2.Request(
        id=0,
        inputs="Test",
        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
        prefill_logprobs=True,
        truncate=100,
        parameters=default_pb_parameters,
        stopping_parameters=default_pb_stop_parameters,
    )


@pytest.fixture
def default_pb_batch(default_pb_request):
    return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)


@pytest.fixture
def default_bloom_batch(default_pb_batch, bloom_560m_tokenizer):
    return BloomCausalLMBatch.from_pb(
        default_pb_batch, bloom_560m_tokenizer, torch.float32, torch.device("cpu")
    )


@pytest.fixture
def default_multi_requests_bloom_batch(default_pb_request, bloom_560m_tokenizer):
    req_0 = copy(default_pb_request)
    req_0.id = 1
    req_1 = default_pb_request
    req_1.id = 2
    req_1.stopping_parameters.max_new_tokens = 5

    batch_pb = generate_pb2.Batch(id=0, requests=[req_0, req_1], size=2)
    return BloomCausalLMBatch.from_pb(
        batch_pb, bloom_560m_tokenizer, torch.float32, torch.device("cpu")
    )


def test_batch_from_pb(default_pb_batch, default_bloom_batch):
    batch = default_bloom_batch

    assert batch.batch_id == default_pb_batch.id
    assert batch.requests == default_pb_batch.requests

    assert len(batch.input_ids) == default_pb_batch.size
    assert batch.input_ids[0][-1] == 10264
    assert torch.all(batch.input_ids[0][:-1] == 3)

    assert batch.attention_mask[0][0] == 1
    assert torch.all(batch.attention_mask[0][1:] == 0)

    assert batch.past_key_values is None

    assert all(
        [
            torch.equal(input_ids, all_input_ids[:, 0])
            for input_ids, all_input_ids in zip(batch.input_ids, batch.all_input_ids)
        ]
    )

    assert batch.input_lengths == [1]

    assert len(batch) == default_pb_batch.size
    assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch)

    assert batch.max_input_length == batch.input_lengths[0]


def test_batch_concatenate_no_prefill(default_bloom_batch):
    with pytest.raises(ValueError):
        BloomCausalLMBatch.concatenate([default_bloom_batch, default_bloom_batch])


def test_causal_lm_batch_type(default_bloom):
    assert default_bloom.batch_type == BloomCausalLMBatch


def test_causal_lm_generate_token(default_bloom, default_bloom_batch):
    sequence_length = len(default_bloom_batch.all_input_ids[0])
    generations, next_batch, _ = default_bloom.generate_token(default_bloom_batch)

    assert len(generations) == len(default_bloom_batch)
    assert isinstance(next_batch, CausalLMBatch)
    assert not next_batch.keys_head_dim_last

    assert len(next_batch.all_input_ids) == len(next_batch)
    assert len(next_batch.all_input_ids[0]) == sequence_length + 1
    assert len(next_batch.attention_mask[0]) == 11
    assert torch.all(next_batch.all_input_ids[0][-2:] == 10264)
    assert torch.all(next_batch.all_input_ids[0][:-2] == 3)

    assert torch.all(next_batch.attention_mask[0][:2] == 1)
    assert torch.all(next_batch.attention_mask[0][2:] == 0)

    assert next_batch.input_ids.shape == (len(next_batch), 1)
    assert next_batch.input_ids[0, 0] == 10264

    assert next_batch.input_lengths == [2]
    assert next_batch.max_input_length == next_batch.input_lengths[0]

    assert next_batch.past_key_values is not None
    assert all(
        [p[0].shape == (16, 64, sequence_length) for p in next_batch.past_key_values]
    )
    assert all(
        [p[1].shape == (16, sequence_length, 64) for p in next_batch.past_key_values]
    )
    assert all([generation.generated_text is None for generation in generations])
    assert all([len(generation.prefill_tokens) == 1 for generation in generations])
    assert all(
        [
            token_id.item() == 10264
            for generation in generations
            for token_id in generation.tokens.token_ids
        ]
    )
    assert all(
        [
            token_text == "Test"
            for generation in generations
            for token_text in generation.tokens.texts
        ]
    )
    assert generations[0].request_id == 0


def test_causal_lm_generate_token_completion(default_bloom, default_bloom_batch):
    next_batch = default_bloom_batch
    for _ in range(default_bloom_batch.stopping_criterias[0].max_new_tokens - 1):
        generations, next_batch, _ = default_bloom.generate_token(next_batch)
        assert len(generations) == len(default_bloom_batch)

    generations, next_batch, _ = default_bloom.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert (
        generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
    )
    assert generations[0].request_id == default_bloom_batch.requests[0].id
    assert (
        generations[0].generated_text.generated_tokens
        == default_bloom_batch.stopping_criterias[0].max_new_tokens
    )


def test_causal_lm_generate_token_completion_multi(
    default_bloom, default_multi_requests_bloom_batch
):
    next_batch = default_multi_requests_bloom_batch

    for i in range(
        default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 1
    ):
        generations, next_batch, _ = default_bloom.generate_token(next_batch)
        assert len(generations) == len(default_multi_requests_bloom_batch)

    generations, next_batch, _ = default_bloom.generate_token(next_batch)
    assert next_batch is not None

    assert len(generations) == 2
    assert generations[1].generated_text.text == "TestTestTestTestTest"
    assert (
        generations[1].request_id == default_multi_requests_bloom_batch.requests[1].id
    )
    assert (
        generations[1].generated_text.generated_tokens
        == default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
    )
    # Copy stopping_criterias before filtering
    stopping_criterias = default_multi_requests_bloom_batch.stopping_criterias.copy()

    next_batch = next_batch.filter([next_batch.requests[0].id])

    for _ in range(
        stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1
    ):
        generations, next_batch, _ = default_bloom.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_bloom.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert (
        generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
    )
    assert (
        generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id
    )
    assert (
        generations[0].generated_text.generated_tokens
        == default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens
    )


def test_batch_concatenate(
    default_bloom, default_bloom_batch, default_multi_requests_bloom_batch
):
    next_batch_0 = default_bloom_batch
    _, next_batch_0, _ = default_bloom.generate_token(next_batch_0)
    _, next_batch_0, _ = default_bloom.generate_token(next_batch_0)

    next_batch_1 = default_multi_requests_bloom_batch
    _, next_batch_1, _ = default_bloom.generate_token(next_batch_1)

    # Clone past_key_values before concatenating to compare after,
    # because they are removed from the concatenated batches
    next_batch_0_past_key_values = [
        (k.clone(), v.clone()) for (k, v) in next_batch_0.past_key_values
    ]
    next_batch_1_past_key_values = [
        (k.clone(), v.clone()) for (k, v) in next_batch_1.past_key_values
    ]

    next_batch = BloomCausalLMBatch.concatenate([next_batch_0, next_batch_1])

    assert torch.equal(next_batch.all_input_ids[0], next_batch_0.all_input_ids[0])
    assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0])
    assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1])

    assert torch.all(
        next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1
    )
    assert torch.all(
        next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1
    )
    assert torch.all(next_batch.attention_mask[1:, 3:] == 0)

    assert next_batch.batch_id == 0
    assert torch.all(next_batch.input_ids == 10264)

    assert next_batch.input_lengths == [3, 2, 2]
    assert next_batch.max_input_length == 3

    assert next_batch.requests[0] == next_batch_0.requests[0]
    assert next_batch.requests[1:] == list(next_batch_1.requests)

    assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
    assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers

    assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0]
    assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias

    assert next_batch.past_key_values is not None
    assert all([p[0].shape == (3, 16, 64, 2) for p in next_batch.past_key_values])
    assert all([p[1].shape == (3, 16, 2, 64) for p in next_batch.past_key_values])

    for i, past in enumerate(next_batch.past_key_values):
        assert torch.equal(next_batch_0_past_key_values[i][0][:, :, -2:], past[0][0])
        assert torch.equal(
            next_batch_1_past_key_values[i][0][:, :, -1:],
            past[0][1:, :, :, -1].reshape(-1, 64, 1),
        )

        assert torch.equal(next_batch_0_past_key_values[i][1][:, -2:, :], past[1][0])
        assert torch.equal(
            next_batch_1_past_key_values[i][1][:, -1:, :],
            past[1][1:, :, -1, :].reshape(-1, 1, 64),
        )

    for _ in range(
        default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 2
    ):
        generations, next_batch, _ = default_bloom.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_bloom.generate_token(next_batch)
    assert next_batch is not None

    assert len(generations) == 3
    assert generations[2].generated_text.text == "TestTestTestTestTest"
    assert (
        generations[2].request_id == default_multi_requests_bloom_batch.requests[1].id
    )
    assert (
        generations[2].generated_text.generated_tokens
        == default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
    )

    next_batch = next_batch.filter(
        [next_batch.requests[0].id, next_batch.requests[1].id]
    )

    for _ in range(
        default_bloom_batch.stopping_criterias[0].max_new_tokens
        - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
        - 2
    ):
        generations, next_batch, _ = default_bloom.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_bloom.generate_token(next_batch)
    assert next_batch is not None

    assert len(generations) == 2
    assert (
        generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
    )
    assert generations[0].request_id == default_bloom_batch.requests[0].id
    assert (
        generations[0].generated_text.generated_tokens
        == default_bloom_batch.stopping_criterias[0].max_new_tokens
    )

    next_batch = next_batch.filter([next_batch.requests[1].id])

    for _ in range(
        default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens
        - default_bloom_batch.stopping_criterias[0].max_new_tokens
        - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
        - 4
    ):
        generations, next_batch, _ = default_bloom.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_bloom.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert (
        generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
    )
    assert (
        generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id
    )
    assert (
        generations[0].generated_text.generated_tokens
        == default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens
    )


================================================
FILE: server/tests/models/test_causal_lm.py
================================================
import pytest
import torch

from copy import copy
from transformers import AutoTokenizer

from text_generation_server.pb import generate_pb2
from text_generation_server.models.causal_lm import CausalLM, CausalLMBatch


@pytest.fixture(scope="session")
def default_causal_lm():
    return CausalLM.fallback("gpt2")


@pytest.fixture(scope="session")
def gpt2_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
    tokenizer.pad_token_id = 50256
    return tokenizer


@pytest.fixture
def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
    return generate_pb2.Request(
        id=0,
        inputs="Test",
        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
        prefill_logprobs=True,
        truncate=100,
        parameters=default_pb_parameters,
        stopping_parameters=default_pb_stop_parameters,
    )


@pytest.fixture
def default_pb_batch(default_pb_request):
    return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)


@pytest.fixture
def default_causal_lm_batch(default_pb_batch, gpt2_tokenizer):
    return CausalLMBatch.from_pb(
        default_pb_batch, gpt2_tokenizer, torch.float32, torch.device("cpu")
    )


@pytest.fixture
def default_multi_requests_causal_lm_batch(default_pb_request, gpt2_tokenizer):
    req_0 = copy(default_pb_request)
    req_0.id = 1
    req_1 = default_pb_request
    req_1.id = 2
    req_1.stopping_parameters.max_new_tokens = 5

    batch_pb = generate_pb2.Batch(id=1, requests=[req_0, req_1], size=2)
    return CausalLMBatch.from_pb(
        batch_pb, gpt2_tokenizer, torch.float32, torch.device("cpu")
    )


def test_batch_from_pb(default_pb_batch, default_causal_lm_batch):
    batch = default_causal_lm_batch

    assert batch.batch_id == default_pb_batch.id
    assert batch.requests == default_pb_batch.requests

    assert len(batch.input_ids) == default_pb_batch.size
    assert batch.input_ids[0][-1] == 14402
    assert torch.all(batch.input_ids[0][:-1] == 50256)

    assert batch.attention_mask[0, 0] == 1
    assert torch.all(batch.attention_mask[0, 1:] == 0)

    assert batch.past_key_values is None

    assert all(
        [
            torch.equal(input_ids, all_input_ids[:, 0])
            for input_ids, all_input_ids in zip(batch.input_ids, batch.all_input_ids)
        ]
    )

    assert batch.input_lengths == [1]

    assert len(batch) == default_pb_batch.size
    assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch)

    assert batch.max_input_length == batch.input_lengths[0]


def test_batch_concatenate_no_prefill(default_causal_lm_batch):
    with pytest.raises(ValueError):
        CausalLMBatch.concatenate([default_causal_lm_batch, default_causal_lm_batch])


def test_causal_lm_batch_type(default_causal_lm):
    assert default_causal_lm.batch_type == CausalLMBatch


def test_causal_lm_generate_token(default_causal_lm, default_causal_lm_batch):
    sequence_length = len(default_causal_lm_batch.all_input_ids[0])
    generations, next_batch, _ = default_causal_lm.generate_token(
        default_causal_lm_batch
    )

    assert len(generations) == len(next_batch)
    assert isinstance(next_batch, CausalLMBatch)

    assert len(next_batch.all_input_ids) == len(next_batch)
    assert len(next_batch.all_input_ids[0]) == sequence_length + 1
    assert len(next_batch.attention_mask[0]) == 11
    assert next_batch.all_input_ids[0][-1] == 13
    assert next_batch.all_input_ids[0][-2] == 14402
    assert torch.all(next_batch.all_input_ids[0][:-2] == 50256)

    assert torch.all(next_batch.attention_mask[0][0:2] == 1)
    assert torch.all(next_batch.attention_mask[0][2:] == 0)

    assert next_batch.input_ids.shape == (len(next_batch), 1)
    assert next_batch.input_ids[0, 0] == 13

    assert next_batch.input_lengths == [2]
    assert next_batch.max_input_length == next_batch.input_lengths[0]

    assert next_batch.past_key_values is not None
    assert all(
        [p[0].shape == (1, 12, sequence_length, 64) for p in next_batch.past_key_values]
    )
    assert all(
        [p[1].shape == (1, 12, sequence_length, 64) for p in next_batch.past_key_values]
    )
    assert all([generation.generated_text is None for generation in generations])
    assert all([len(generation.prefill_tokens) == 1 for generation in generations])
    assert all(
        [
            token_id.item() == 13
            for generation in generations
            for token_id in generation.tokens.token_ids
        ]
    )
    assert all(
        [
            token_text == "."
            for generation in generations
            for token_text in generation.tokens.texts
        ]
    )
    assert generations[0].request_id == 0


def test_causal_lm_generate_token_completion(
    default_causal_lm, default_causal_lm_batch
):
    next_batch = default_causal_lm_batch
    for _ in range(default_causal_lm_batch.stopping_criterias[0].max_new_tokens - 1):
        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert generations[0].generated_text.text == ".java:784) at net.minecraft."
    assert generations[0].request_id == default_causal_lm_batch.requests[0].id
    assert (
        generations[0].generated_text.generated_tokens
        == default_causal_lm_batch.stopping_criterias[0].max_new_tokens
    )


def test_causal_lm_generate_token_completion_multi(
    default_causal_lm, default_multi_requests_causal_lm_batch
):
    next_batch = default_multi_requests_causal_lm_batch

    for i in range(
        default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 1
    ):
        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
    assert next_batch is not None

    assert len(generations) == 2
    assert generations[1].generated_text.text == ".java:784)"
    assert (
        generations[1].request_id
        == default_multi_requests_causal_lm_batch.requests[1].id
    )
    assert (
        generations[1].generated_text.generated_tokens
        == default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
    )
    # Copy stopping_criterias before filtering
    stopping_criterias = (
        default_multi_requests_causal_lm_batch.stopping_criterias.copy()
    )

    next_batch = next_batch.filter([next_batch.requests[0].id])

    for _ in range(
        stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1
    ):
        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert generations[0].generated_text.text == ".java:784) at net.minecraft."
    assert (
        generations[0].request_id
        == default_multi_requests_causal_lm_batch.requests[0].id
    )
    assert (
        generations[0].generated_text.generated_tokens
        == default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens
    )


def test_batch_concatenate(
    default_causal_lm, default_causal_lm_batch, default_multi_requests_causal_lm_batch
):
    next_batch_0 = default_causal_lm_batch
    _, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0)
    _, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0)

    next_batch_1 = default_multi_requests_causal_lm_batch
    _, next_batch_1, _ = default_causal_lm.generate_token(next_batch_1)

    # Clone past_key_values before concatenating to compare after,
    # because they are removed from the concatenated batches
    next_batch_0_past_key_values = [
        (k.clone(), v.clone()) for (k, v) in next_batch_0.past_key_values
    ]
    next_batch_1_past_key_values = [
        (k.clone(), v.clone()) for (k, v) in next_batch_1.past_key_values
    ]

    next_batch = CausalLMBatch.concatenate([next_batch_0, next_batch_1])

    assert torch.equal(next_batch.all_input_ids[0], next_batch_0.all_input_ids[0])
    assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0])
    assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1])

    assert torch.all(
        next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1
    )
    assert torch.all(
        next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1
    )
    assert torch.all(next_batch.attention_mask[1:, 3:] == 0)

    assert next_batch.batch_id == 0
    assert next_batch.input_ids[0, 0] == 12355
    assert torch.all(next_batch.input_ids[1:] == 13)

    assert next_batch.input_lengths == [3, 2, 2]
    assert next_batch.max_input_length == 3

    assert next_batch.requests[0] == next_batch_0.requests[0]
    assert next_batch.requests[1:] == list(next_batch_1.requests)

    assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
    assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers

    assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0]
    assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias

    assert next_batch.past_key_values is not None
    assert all([p[0].shape == (3, 12, 2, 64) for p in next_batch.past_key_values])
    assert all([p[1].shape == (3, 12, 2, 64) for p in next_batch.past_key_values])

    for i, past in enumerate(next_batch.past_key_values):
        assert torch.equal(next_batch_0_past_key_values[i][0][0, :, -2:], past[0][0])
        assert torch.equal(
            next_batch_1_past_key_values[i][0][:, :, -1:], past[0][1:, :, -1:, :]
        )

        assert torch.equal(next_batch_0_past_key_values[i][1][0, :, -2:], past[1][0])
        assert torch.equal(
            next_batch_1_past_key_values[i][1][:, :, -1:], past[1][1:, :, -1:, :]
        )

    for _ in range(
        default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 2
    ):
        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
    assert next_batch is not None

    assert len(generations) == 3
    assert generations[2].generated_text.text == ".java:784)"
    assert (
        generations[2].request_id
        == default_multi_requests_causal_lm_batch.requests[1].id
    )
    assert (
        generations[2].generated_text.generated_tokens
        == default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
    )

    next_batch = next_batch.filter(
        [next_batch.requests[0].id, next_batch.requests[1].id]
    )

    for _ in range(
        default_causal_lm_batch.stopping_criterias[0].max_new_tokens
        - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
        - 2
    ):
        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
    assert next_batch is not None

    assert len(generations) == 2
    assert generations[0].generated_text.text == ".java:784) at net.minecraft."
    assert generations[0].request_id == default_causal_lm_batch.requests[0].id
    assert (
        generations[0].generated_text.generated_tokens
        == default_causal_lm_batch.stopping_criterias[0].max_new_tokens
    )

    next_batch = next_batch.filter([next_batch.requests[1].id])

    for _ in range(
        default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens
        - default_causal_lm_batch.stopping_criterias[0].max_new_tokens
        - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
        - 4
    ):
        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert generations[0].generated_text.text == ".java:784) at net.minecraft."
    assert (
        generations[0].request_id
        == default_multi_requests_causal_lm_batch.requests[0].id
    )
    assert (
        generations[0].generated_text.generated_tokens
        == default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens
    )


================================================
FILE: server/tests/models/test_model.py
================================================
import pytest
import torch

from transformers import AutoTokenizer

from text_generation_server.models import Model


def get_test_model():
    class TestModel(Model):
        def batch_type(self):
            raise NotImplementedError

        def generate_token(self, batch):
            raise NotImplementedError

    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")

    model = TestModel(
        "test_model_id",
        torch.nn.Linear(1, 1),
        tokenizer,
        False,
        torch.float32,
        torch.device("cpu"),
    )
    return model


@pytest.mark.private
def test_decode_streaming_english_spaces():
    model = get_test_model()
    truth = "Hello here, this is a simple test"
    all_input_ids = [15043, 1244, 29892, 445, 338, 263, 2560, 1243]
    assert (
        all_input_ids == model.tokenizer(truth, add_special_tokens=False)["input_ids"]
    )

    decoded_text = ""
    offset = 0
    token_offset = 0
    for i in range(len(all_input_ids)):
        text, offset, token_offset = model.decode_token(
            all_input_ids[: i + 1], offset, token_offset
        )
        decoded_text += text

    assert decoded_text == truth


@pytest.mark.private
def test_decode_streaming_chinese_utf8():
    model = get_test_model()
    truth = "我很感谢你的热情"
    all_input_ids = [
        30672,
        232,
        193,
        139,
        233,
        135,
        162,
        235,
        179,
        165,
        30919,
        30210,
        234,
        134,
        176,
        30993,
    ]

    decoded_text = ""
    offset = 0
    token_offset = 0
    for i in range(len(all_input_ids)):
        text, offset, token_offset = model.decode_token(
            all_input_ids[: i + 1], offset, token_offset
        )
        decoded_text += text

    assert decoded_text == truth


================================================
FILE: server/tests/models/test_santacoder.py
================================================
import pytest

from text_generation_server.pb import generate_pb2
from text_generation_server.models.causal_lm import CausalLMBatch, CausalLM


@pytest.fixture(scope="session")
def default_santacoder():
    return CausalLM.fallback(model_id="bigcode/santacoder")


@pytest.fixture
def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
    return generate_pb2.Request(
        id=0,
        inputs="def",
        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="def")]),
        prefill_logprobs=True,
        truncate=100,
        parameters=default_pb_parameters,
        stopping_parameters=default_pb_stop_parameters,
    )


@pytest.fixture
def default_pb_batch(default_pb_request):
    return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)


@pytest.fixture
def default_fim_pb_request(default_pb_parameters, default_pb_stop_parameters):
    return generate_pb2.Request(
        id=0,
        inputs="<fim-prefix>def<fim-suffix>world<fim-middle>",
        input_chunks=generate_pb2.Input(
            chunks=[
                generate_pb2.InputChunk(
                    text="<fim-prefix>def<fim-suffix>world<fim-middle>"
                )
            ]
        ),
        prefill_logprobs=True,
        truncate=100,
        parameters=default_pb_parameters,
        stopping_parameters=default_pb_stop_parameters,
    )


@pytest.fixture
def default_fim_pb_batch(default_fim_pb_request):
    return generate_pb2.Batch(id=0, requests=[default_fim_pb_request], size=1)


@pytest.mark.skip
def test_santacoder_generate_token_completion(default_santacoder, default_pb_batch):
    batch = CausalLMBatch.from_pb(
        default_pb_batch,
        default_santacoder.tokenizer,
        default_santacoder.dtype,
        default_santacoder.device,
    )
    next_batch = batch

    for _ in range(batch.stopping_criterias[0].max_new_tokens - 1):
        generations, next_batch, _ = default_santacoder.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_santacoder.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert generations[0].generated_text.text == " test_get_all_users_with_"
    assert generations[0].request_id == batch.requests[0].id
    assert (
        generations[0].generated_text.generated_tokens
        == batch.stopping_criterias[0].max_new_tokens
    )


@pytest.mark.skip
def test_fim_santacoder_generate_token_completion(
    default_santacoder, default_fim_pb_batch
):
    batch = CausalLMBatch.from_pb(
        default_fim_pb_batch,
        default_santacoder.tokenizer,
        default_santacoder.dtype,
        default_santacoder.device,
    )
    next_batch = batch

    for _ in range(batch.stopping_criterias[0].max_new_tokens - 1):
        generations, next_batch, _ = default_santacoder.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_santacoder.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert (
        generations[0].generated_text.text
        == """ineProperty(exports, "__esModule", { value"""
    )
    assert generations[0].request_id == batch.requests[0].id
    assert (
        generations[0].generated_text.generated_tokens
        == batch.stopping_criterias[0].max_new_tokens
    )


================================================
FILE: server/tests/models/test_seq2seq_lm.py
================================================
import pytest
import torch

from copy import copy

from transformers import AutoTokenizer

from text_generation_server.pb import generate_pb2
from text_generation_server.models.seq2seq_lm import Seq2SeqLM, Seq2SeqLMBatch


@pytest.fixture(scope="session")
def mt0_small_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(
        "bigscience/mt0-small", padding_side="left"
    )
    tokenizer.bos_token_id = 0
    return tokenizer


@pytest.fixture(scope="session")
def default_seq2seq_lm():
    return Seq2SeqLM.fallback("bigscience/mt0-small")


@pytest.fixture
def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
    return generate_pb2.Request(
        id=0,
        inputs="Test",
        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
        prefill_logprobs=True,
        truncate=100,
        parameters=default_pb_parameters,
        stopping_parameters=default_pb_stop_parameters,
    )


@pytest.fixture
def default_pb_batch(default_pb_request):
    return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)


@pytest.fixture
def default_seq2seq_lm_batch(default_pb_batch, mt0_small_tokenizer):
    return Seq2SeqLMBatch.from_pb(
        default_pb_batch, mt0_small_tokenizer, torch.float32, torch.device("cpu")
    )


@pytest.fixture
def default_multi_requests_seq2seq_lm_batch(default_pb_request, mt0_small_tokenizer):
    req_0 = copy(default_pb_request)
    req_0.id = 1
    req_1 = default_pb_request
    req_1.id = 2
    req_1.stopping_parameters.max_new_tokens = 5

    batch_pb = generate_pb2.Batch(id=0, requests=[req_0, req_1], size=2)
    return Seq2SeqLMBatch.from_pb(
        batch_pb, mt0_small_tokenizer, torch.float32, torch.device("cpu")
    )


def test_batch_from_pb(default_pb_batch, default_seq2seq_lm_batch):
    batch = default_seq2seq_lm_batch
    sequence_length = len(default_seq2seq_lm_batch.input_ids[0])

    assert batch.batch_id == default_pb_batch.id
    assert batch.requests == default_pb_batch.requests

    assert batch.input_ids.shape == (default_pb_batch.size, sequence_length)
    assert batch.input_ids[0][-2] == 4268
    assert batch.input_ids[0][-1] == 1
    assert torch.all(batch.input_ids[0][:-2] == 0)

    assert torch.all(batch.attention_mask[0][-2:] == 1)
    assert torch.all(batch.attention_mask[0][:-2] == 0)

    assert len(batch.decoder_input_ids) == default_pb_batch.size
    assert batch.decoder_attention_mask is None
    assert batch.encoder_last_hidden_state is None

    assert batch.past_key_values is None

    assert batch.input_lengths == [2]
    assert batch.decoder_input_lengths == [1]

    assert len(batch) == default_pb_batch.size
    assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch)

    assert batch.max_input_length == batch.input_lengths[0]
    assert batch.max_decoder_input_length == batch.decoder_input_lengths[0]


def test_batch_concatenate_no_prefill(default_seq2seq_lm_batch):
    with pytest.raises(ValueError):
        Seq2SeqLMBatch.concatenate([default_seq2seq_lm_batch, default_seq2seq_lm_batch])


def test_seq2seq_lm_batch_type(default_seq2seq_lm):
    assert default_seq2seq_lm.batch_type == Seq2SeqLMBatch


def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_lm_batch):
    sequence_length = len(default_seq2seq_lm_batch.input_ids[0])
    generations, next_batch, _ = default_seq2seq_lm.generate_token(
        default_seq2seq_lm_batch
    )

    assert len(generations) == len(next_batch)
    assert isinstance(next_batch, Seq2SeqLMBatch)

    assert next_batch.input_ids is None
    assert torch.equal(
        next_batch.attention_mask, default_seq2seq_lm_batch.attention_mask
    )
    assert next_batch.input_lengths == default_seq2seq_lm_batch.input_lengths
    assert next_batch.max_input_length == default_seq2seq_lm_batch.max_input_length
    assert (
        next_batch.next_token_choosers == default_seq2seq_lm_batch.next_token_choosers
    )
    assert next_batch.stopping_criterias == default_seq2seq_lm_batch.stopping_criterias

    assert len(next_batch.decoder_input_ids) == len(next_batch)
    assert next_batch.all_decoder_input_ids[0][0] == 0
    assert next_batch.all_decoder_input_ids[0][1] == 259
    assert next_batch.decoder_attention_mask is None
    assert next_batch.encoder_last_hidden_state.shape == (1, sequence_length, 512)

    assert next_batch.decoder_input_lengths == [2]
    assert next_batch.max_decoder_input_length == 2

    assert next_batch.past_key_values is not None
    assert all(
        [p[0].shape == (len(next_batch), 6, 1, 64) for p in next_batch.past_key_values]
    )
    assert all(
        [p[1].shape == (len(next_batch), 6, 1, 64) for p in next_batch.past_key_values]
    )
    assert all(
        [
            p[2].shape == (len(next_batch), 6, sequence_length, 64)
            for p in next_batch.past_key_values
        ]
    )
    assert all(
        [
            p[3].shape == (len(next_batch), 6, sequence_length, 64)
            for p in next_batch.past_key_values
        ]
    )
    assert all([generation.generated_text is None for generation in generations])
    assert all([len(generation.prefill_tokens) == 1 for generation in generations])
    assert all(
        [
            token_id.item() == 259
            for generation in generations
            for token_id in generation.tokens.token_ids
        ]
    )
    assert all(
        [
            token_text == " "
            for generation in generations
            for token_text in generation.tokens.texts
        ]
    )
    assert generations[0].request_id == 0


def test_seq2seq_lm_generate_token_completion(
    default_seq2seq_lm, default_seq2seq_lm_batch
):
    next_batch = default_seq2seq_lm_batch
    for _ in range(6):
        generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert generations[0].generated_text.text == "a few weeks"
    assert generations[0].request_id == default_seq2seq_lm_batch.requests[0].id
    assert generations[0].generated_text.generated_tokens == 7


def test_seq2seq_lm_generate_token_completion_multi(
    default_seq2seq_lm, default_multi_requests_seq2seq_lm_batch
):
    next_batch = default_multi_requests_seq2seq_lm_batch

    for i in range(4):
        generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
    assert next_batch is not None

    assert len(generations) == 2
    assert generations[1].generated_text.text == "a few "
    assert (
        generations[1].request_id
        == default_multi_requests_seq2seq_lm_batch.requests[1].id
    )
    assert generations[1].generated_text.generated_tokens == 5

    next_batch = next_batch.filter([next_batch.requests[0].id])

    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
    assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert generations[0].generated_text.text == "a few weeks"
    assert (
        generations[0].request_id
        == default_multi_requests_seq2seq_lm_batch.requests[0].id
    )
    assert generations[0].generated_text.generated_tokens == 7


def test_batch_concatenate(
    default_seq2seq_lm,
    default_seq2seq_lm_batch,
    default_multi_requests_seq2seq_lm_batch,
):
    next_batch_0 = default_seq2seq_lm_batch
    _, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0)
    _, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0)

    next_batch_1 = default_multi_requests_seq2seq_lm_batch
    _, next_batch_1, _ = default_seq2seq_lm.generate_token(next_batch_1)

    # Copy hidden state because it is removed from the concatenated branches
    next_batch_0_encoder_last_hidden_state = next_batch_0.encoder_last_hidden_state
    next_batch_1_encoder_last_hidden_state = next_batch_1.encoder_last_hidden_state

    # Clone past_key_values before concatenating to compare after,
    # because they are removed from the concatenated batches
    next_batch_0_past_key_values = [
        [t.clone() for t in layer] for layer in next_batch_0.past_key_values
    ]
    next_batch_1_past_key_values = [
        [t.clone() for t in layer] for layer in next_batch_1.past_key_values
    ]

    next_batch = Seq2SeqLMBatch.concatenate([next_batch_0, next_batch_1])

    assert next_batch.batch_id == 0

    assert torch.equal(
        next_batch.decoder_input_ids[0], next_batch_0.decoder_input_ids[0]
    )
    assert next_batch.all_decoder_input_ids[1][0] == 0
    assert next_batch.all_decoder_input_ids[2][0] == 0
    assert torch.equal(
        next_batch.decoder_input_ids[1:, -2:], next_batch_1.decoder_input_ids
    )

    assert torch.all(next_batch.decoder_attention_mask[0, :3] == 1)
    assert torch.all(next_batch.decoder_attention_mask[0, 3:] == 0)
    assert torch.all(next_batch.decoder_attention_mask[1:, 0] == 0)
    assert torch.all(next_batch.decoder_attention_mask[1:, 1:3] == 1)

    assert torch.equal(
        next_batch.encoder_last_hidden_state[0],
        next_batch_0_encoder_last_hidden_state[0, -2:],
    )
    assert torch.equal(
        next_batch.encoder_last_hidden_state[1:],
        next_batch_1_encoder_last_hidden_state[:, -2:],
    )

    assert next_batch.input_lengths == [2, 2, 2]
    assert next_batch.decoder_input_lengths == [3, 2, 2]
    assert next_batch.max_input_length == 2
    assert next_batch.max_decoder_input_length == 3

    assert next_batch.requests[0] == next_batch_0.requests[0]
    assert next_batch.requests[1:] == list(next_batch_1.requests)

    assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
    assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers

    assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0]
    assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias

    assert next_batch.past_key_values is not None
    assert all(
        [p[0].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values]
    )
    assert all(
        [p[1].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values]
    )
    assert all(
        [p[2].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values]
    )
    assert all(
        [p[3].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values]
    )

    for i, past in enumerate(next_batch.past_key_values):
        assert torch.equal(next_batch_0_past_key_values[i][0][0, :, -2:, :], past[0][0])
        assert torch.equal(
            next_batch_1_past_key_values[i][0][:, :, -1:, :], past[0][1:, :, -1:, :]
        )

        assert torch.equal(next_batch_0_past_key_values[i][1][0, :, -2:, :], past[1][0])
        assert torch.equal(
            next_batch_1_past_key_values[i][1][:, :, -1:, :], past[1][1:, :, -1:, :]
        )

        assert torch.equal(next_batch_0_past_key_values[i][2][0, :, -2:, :], past[2][0])
        assert torch.equal(
            next_batch_1_past_key_values[i][2][:, :, -2:, :], past[2][1:]
        )

        assert torch.equal(next_batch_0_past_key_values[i][3][0, :, -2:, :], past[3][0])
        assert torch.equal(
            next_batch_1_past_key_values[i][3][:, :, -2:, :], past[3][1:]
        )

    for _ in range(3):
        generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
        assert len(generations) == len(next_batch)

    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
    assert next_batch is not None

    assert len(generations) == 3
    assert generations[2].generated_text.text == "a few "
    assert (
        generations[2].request_id
        == default_multi_requests_seq2seq_lm_batch.requests[1].id
    )
    assert generations[2].generated_text.generated_tokens == 5

    next_batch = next_batch.filter(
        [next_batch.requests[0].id, next_batch.requests[1].id]
    )

    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
    assert next_batch is not None

    assert len(generations) == 2
    assert generations[0].generated_text.text == "a few weeks"
    assert generations[0].request_id == default_seq2seq_lm_batch.requests[0].id
    assert generations[0].generated_text.generated_tokens == 7

    next_batch = next_batch.filter([next_batch.requests[1].id])

    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
    assert next_batch is None

    assert len(generations) == 1
    assert generations[0].generated_text.text == "a few weeks"
    assert (
        generations[0].request_id
        == default_multi_requests_seq2seq_lm_batch.requests[0].id
    )
    assert generations[0].generated_text.generated_tokens == 7


================================================
FILE: server/tests/utils/test_adapter.py
================================================
import pytest
from unittest.mock import Mock
from text_generation_server.utils.adapter import (
    get_attn_weights,
    get_mlp_weights,
    parse_lora_adapters,
    AdapterInfo,
)


def test_parse_lora_adapters_empty():
    assert parse_lora_adapters(None) == []
    assert parse_lora_adapters("") == []


def test_parse_lora_adapters_single():
    result = parse_lora_adapters("adapter1")
    assert result == [AdapterInfo(id="adapter1", path=None, revision=None)]


def test_parse_lora_adapters_with_path():
    result = parse_lora_adapters("adapter1=path/to/adapter1")
    assert result == [
        AdapterInfo(id="adapter1", path="path/to/adapter1", revision=None)
    ]


def test_parse_lora_adapters_with_path_and_revision():
    result = parse_lora_adapters("adapter1=path/to/adapter1@main")
    assert result == [
        AdapterInfo(id="adapter1", path="path/to/adapter1", revision="main")
    ]


def test_parse_lora_adapters_multiple():
    result = parse_lora_adapters(
        "adapter1,adapter2=path/to/adapter2,adapter3=path/to/adapter3@dev"
    )
    assert result == [
        AdapterInfo(id="adapter1", path=None, revision=None),
        AdapterInfo(id="adapter2", path="path/to/adapter2", revision=None),
        AdapterInfo(id="adapter3", path="path/to/adapter3", revision="dev"),
    ]


def test_parse_lora_adapters_invalid_format():
    try:
        parse_lora_adapters("adapter1,invalid=format=test,adapter3")
        assert False, "Should have raised ValueError"
    except ValueError as e:
        assert str(e) == "Invalid LoRA adapter format: invalid=format=test"


def test_get_attn_weights():
    # create a mock layer
    mock_layer = Mock()
    mock_layer.self_attn.query_key_value = Mock()
    mock_layer.self_attn.o_proj = Mock()

    # call the function
    result = get_attn_weights(2, mock_layer)

    # assert the result
    expected = {
        (2, "q_proj"): (
            "model.layers.2.self_attn.q_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "k_proj"): (
            "model.layers.2.self_attn.k_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "qkv_proj"): (
            "model.layers.2.self_attn.qkv_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "v_proj"): (
            "model.layers.2.self_attn.v_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj),
    }
    assert result == expected


def test_get_mlp_weights_with_gate_up_proj():
    # create a mock layer with gate_up_proj
    mock_layer = Mock()
    mock_layer.mlp.gate_up_proj = Mock()
    mock_layer.mlp.down_proj = Mock()

    # call the function
    result = get_mlp_weights(3, mock_layer)

    # assert the result
    expected = {
        (3, "c_fc"): ("model.layers.3.mlp.c_fc", mock_layer.mlp.c_fc),
        (3, "c_proj"): ("model.layers.3.mlp.c_proj", mock_layer.mlp.c_proj),
        (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj),
        (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj),
        (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj),
    }
    assert result == expected


def test_get_mlp_weights_without_gate_up_proj():
    # create a mock layer without gate_up_proj
    mock_layer = Mock()
    mock_layer.mlp = Mock(spec=[])

    # call the function
    result = get_mlp_weights(1, mock_layer)

    # assert the result
    assert result == {}


@pytest.mark.parametrize("layer_index", [0, 1, 5])
def test_get_attn_weights_different_layers(layer_index):
    mock_layer = Mock()
    mock_layer.self_attn.query_key_value = Mock()
    mock_layer.self_attn.o_proj = Mock()

    result = get_attn_weights(layer_index, mock_layer)

    for k in ["q", "k", "v"]:
        assert (layer_index, f"{k}_proj") in result
        assert (
            result[(layer_index, f"{k}_proj")][0]
            == f"model.layers.{layer_index}.self_attn.{k}_proj"
        )

    assert (layer_index, "o_proj") in result
    assert (
        result[(layer_index, "o_proj")][0]
        == f"model.layers.{layer_index}.self_attn.o_proj"
    )


@pytest.mark.parametrize("layer_index", [0, 1, 5])
def test_get_mlp_weights_different_layers(layer_index):
    mock_layer = Mock()
    mock_layer.mlp.gate_up_proj = Mock()
    mock_layer.mlp.down_proj = Mock()

    result = get_mlp_weights(layer_index, mock_layer)

    for k in ["gate", "up", "down"]:
        assert (layer_index, f"{k}_proj") in result
        assert (
            result[(layer_index, f"{k}_proj")][0]
            == f"model.layers.{layer_index}.mlp.{k}_proj"
        )


def test_get_attn_weights_llama_compatibility():
    mock_layer = Mock()
    mock_layer.self_attn.query_key_value = Mock()
    mock_layer.self_attn.o_proj = Mock()

    result = get_attn_weights(2, mock_layer)

    expected = {
        (2, "q_proj"): (
            "model.layers.2.self_attn.q_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "k_proj"): (
            "model.layers.2.self_attn.k_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "qkv_proj"): (
            "model.layers.2.self_attn.qkv_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "v_proj"): (
            "model.layers.2.self_attn.v_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj),
    }
    assert result == expected


def test_get_mlp_weights_llama_compatibility():
    mock_layer = Mock()
    mock_layer.mlp.gate_up_proj = Mock()
    mock_layer.mlp.down_proj = Mock()

    result = get_mlp_weights(3, mock_layer)

    expected = {
        (3, "c_fc"): ("model.layers.3.mlp.c_fc", mock_layer.mlp.c_fc),
        (3, "c_proj"): ("model.layers.3.mlp.c_proj", mock_layer.mlp.c_proj),
        (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj),
        (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj),
        (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj),
    }
    assert result == expected


def test_get_attn_weights_gemma_compatibility():
    mock_layer = Mock()
    mock_layer.self_attn.query_key_value = Mock()
    mock_layer.self_attn.o_proj = Mock()

    result = get_attn_weights(2, mock_layer)

    expected = {
        (2, "q_proj"): (
            "model.layers.2.self_attn.q_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "k_proj"): (
            "model.layers.2.self_attn.k_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "qkv_proj"): (
            "model.layers.2.self_attn.qkv_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "v_proj"): (
            "model.layers.2.self_attn.v_proj",
            mock_layer.self_attn.query_key_value,
        ),
        (2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj),
    }
    assert result == expected


def test_get_mlp_weights_gemma_compatibility():
    mock_layer = Mock()
    mock_layer.mlp.gate_proj = Mock()
    mock_layer.mlp.up_proj = Mock()
    mock_layer.mlp.down_proj = Mock()

    # ensure that the mock_layer.mlp.gate_up_proj attribute does not exist.
    # This is necessary because the use of `Mock` automatically creates any
    # attributes that are accessed, even if they don't exist in the actual
    # implementation. If `gate_up_proj` were created, `get_mlp_weights` might
    # follow the wrong execution path and return an incorrect result.
    del mock_layer.mlp.gate_up_proj

    result = get_mlp_weights(3, mock_layer)

    expected = {
        (3, "c_fc"): ("model.layers.3.mlp.c_fc", mock_layer.mlp.c_fc),
        (3, "c_proj"): ("model.layers.3.mlp.c_proj", mock_layer.mlp.c_proj),
        (3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_proj),
        (3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.up_proj),
        (3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj),
    }
    assert result == expected


================================================
FILE: server/tests/utils/test_convert.py
================================================
from text_generation_server.utils.hub import (
    download_weights,
    weight_hub_files,
    weight_files,
)

from text_generation_server.utils.convert import convert_files


def test_convert_files():
    model_id = "bigscience/bloom-560m"
    pt_filenames = weight_hub_files(model_id, extension=".bin")
    local_pt_files = download_weights(pt_filenames, model_id)
    local_st_files = [
        p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors" for p in local_pt_files
    ]
    convert_files(local_pt_files, local_st_files, discard_names=[])

    found_st_files = weight_files(model_id)

    assert all([p in found_st_files for p in local_st_files])


================================================
FILE: server/tests/utils/test_hub.py
================================================
import os
import tempfile

import pytest

import huggingface_hub.constants

import text_generation_server.utils.hub
from text_generation_server.utils.hub import (
    weight_hub_files,
    download_weights,
    weight_files,
    EntryNotFoundError,
    LocalEntryNotFoundError,
    RevisionNotFoundError,
)


@pytest.fixture()
def offline():
    current_value = text_generation_server.utils.hub.HF_HUB_OFFLINE
    text_generation_server.utils.hub.HF_HUB_OFFLINE = True
    yield "offline"
    text_generation_server.utils.hub.HF_HUB_OFFLINE = current_value


@pytest.fixture()
def fresh_cache():
    with tempfile.TemporaryDirectory() as d:
        current_value = huggingface_hub.constants.HUGGINGFACE_HUB_CACHE
        huggingface_hub.constants.HUGGINGFACE_HUB_CACHE = d
        text_generation_server.utils.hub.HUGGINGFACE_HUB_CACHE = d
        os.environ["HUGGINGFACE_HUB_CACHE"] = d
        yield
        huggingface_hub.constants.HUGGINGFACE_HUB_CACHE = current_value
        os.environ["HUGGINGFACE_HUB_CACHE"] = current_value
        text_generation_server.utils.hub.HUGGINGFACE_HUB_CACHE = current_value


@pytest.fixture()
def prefetched():
    model_id = "bert-base-uncased"
    huggingface_hub.snapshot_download(
        repo_id=model_id,
        revision="main",
        local_files_only=False,
        repo_type="model",
        allow_patterns=["*.safetensors"],
    )
    yield model_id


def test_weight_hub_files_offline_error(offline, fresh_cache):
    # If the model is not prefetched then it will raise an error
    with pytest.raises(EntryNotFoundError):
        weight_hub_files("gpt2")


def test_weight_hub_files_offline_ok(prefetched, offline):
    # If the model is prefetched then we should be able to get the weight files from local cache
    filenames = weight_hub_files(prefetched)
    root = None
    assert len(filenames) == 1
    for f in filenames:
        curroot, filename = os.path.split(f)
        if root is None:
            root = curroot
        else:
            assert root == curroot
        assert filename == "model.safetensors"


def test_weight_hub_files():
    filenames = weight_hub_files("bigscience/bloom-560m")
    assert filenames == ["model.safetensors"]


def test_weight_hub_files_llm():
    filenames = weight_hub_files("bigscience/bloom")
    assert filenames == [f"model_{i:05d}-of-00072.safetensors" for i in range(1, 73)]


def test_weight_hub_files_empty():
    with pytest.raises(EntryNotFoundError):
        weight_hub_files("bigscience/bloom", extension=".errors")


def test_download_weights():
    model_id = "bigscience/bloom-560m"
    filenames = weight_hub_files(model_id)
    files = download_weights(filenames, model_id)
    local_files = weight_files("bigscience/bloom-560m")
    assert files == local_files


def test_weight_files_revision_error():
    with pytest.raises(RevisionNotFoundError):
        weight_files("bigscience/bloom-560m", revision="error")


def test_weight_files_not_cached_error(fresh_cache):
    with pytest.raises(LocalEntryNotFoundError):
        weight_files("bert-base-uncased")


================================================
FILE: server/tests/utils/test_layers.py
================================================
import torch
from text_generation_server.layers import (
    TensorParallelEmbedding,
)


class ProcessGroup:
    def __init__(self, rank: int, world_size: int):
        self._rank = rank
        self.world_size = world_size

    def size(self) -> int:
        return self.world_size

    def rank(self) -> int:
        return self._rank


class Weights:
    def __init__(self, rank: int, world_size: int, vocab_size: int, hidden_dim: int):
        self.weight = (
            torch.arange(vocab_size * hidden_dim).float().view(vocab_size, hidden_dim)
        )
        self.process_group = ProcessGroup(rank, world_size)

    def get_partial_sharded(self, name: str, dim: int):
        assert dim == 0

        rank = self.process_group.rank()
        world_size = self.process_group.size()
        size = self.weight.shape[dim]

        block_size = (size + world_size - 1) // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        return self.weight[start:stop]

    def get_shape(self, name: str):
        return self.weight.shape


def test_weight_hub_files_offline_error():

    vocab_size = 17
    weights = Weights(
        rank=0,
        world_size=1,
        vocab_size=vocab_size,
        hidden_dim=256,
    )
    embeddings = TensorParallelEmbedding("", weights)

    input_ids = torch.arange(vocab_size)
    output = embeddings.forward(input_ids)
    assert embeddings.min_id == 0
    assert embeddings.max_id == 17
    torch.testing.assert_close(output, torch.arange(256 * 17).float().view(17, 256))

    weights_0_2 = Weights(rank=0, world_size=2, vocab_size=vocab_size, hidden_dim=256)
    weights_1_2 = Weights(rank=1, world_size=2, vocab_size=vocab_size, hidden_dim=256)
    embeddings_0_2 = TensorParallelEmbedding("", weights_0_2, reduce=False)
    assert embeddings_0_2.min_id == 0
    assert embeddings_0_2.max_id == 9
    torch.testing.assert_close(
        embeddings_0_2.weight,
        torch.cat([torch.arange(9 * 256), torch.zeros(256)], dim=0)
        .view(10, 256)
        .float(),
    )
    embeddings_1_2 = TensorParallelEmbedding("", weights_1_2, reduce=False)
    assert embeddings_1_2.min_id == 9
    assert embeddings_1_2.max_id == 17
    torch.testing.assert_close(
        embeddings_1_2.weight,
        torch.cat([torch.arange(8 * 256) + 9 * 256, torch.zeros(256)], dim=0)
        .view(9, 256)
        .float(),
    )
    output_tp_0 = embeddings_0_2.forward(input_ids)
    output_tp_1 = embeddings_1_2.forward(input_ids)

    torch.testing.assert_close(output, output_tp_0 + output_tp_1)


================================================
FILE: server/tests/utils/test_tokens.py
================================================
import torch
from text_generation_server.utils.tokens import (
    StopSequenceCriteria,
    StoppingCriteria,
    FinishReason,
    batch_top_tokens,
)


def test_stop_sequence_criteria():
    criteria = StopSequenceCriteria("/test;")

    assert not criteria("/")
    assert not criteria("/test")
    assert criteria("/test;")
    assert not criteria("/test; ")


def test_stop_sequence_criteria_escape():
    criteria = StopSequenceCriteria("<|stop|>")

    assert not criteria("<")
    assert not criteria("<|stop")
    assert criteria("<|stop|>")
    assert not criteria("<|stop|> ")


def test_stopping_criteria():
    criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5)
    assert criteria(65827, "/test") == (False, None)
    assert criteria(30, ";") == (True, FinishReason.FINISH_REASON_STOP_SEQUENCE)


def test_stopping_criteria_eos():
    criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5)
    assert criteria(1, "") == (False, None)
    assert criteria(0, "") == (True, FinishReason.FINISH_REASON_EOS_TOKEN)


def test_stopping_criteria_max():
    criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5)
    assert criteria(1, "") == (False, None)
    assert criteria(1, "") == (False, None)
    assert criteria(1, "") == (False, None)
    assert criteria(1, "") == (False, None)
    assert criteria(1, "") == (True, FinishReason.FINISH_REASON_LENGTH)


def test_batch_top_tokens():
    top_n_tokens = [0, 2, 3, 4, 5]
    top_n_tokens_tensor = torch.tensor(top_n_tokens)
    inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5)
    accepted_ids = torch.ones_like(top_n_tokens_tensor)

    topn_tok_ids, topn_tok_logprobs = batch_top_tokens(
        top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids
    )

    assert topn_tok_ids[0] == [[]]
    assert topn_tok_ids[1] == [[0, 3]]
    assert topn_tok_ids[2] == [[0, 3, 1, 4]]
    assert topn_tok_ids[3] == [[0, 3, 1, 4]]
    assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]]

    assert topn_tok_logprobs[0] == [[]]
    assert topn_tok_logprobs[1] == [[-1, -2]]
    assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]]
    assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]]
    assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]]

    # Now let's make second member of the batch be speculated
    inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5 * 2)
    accepted_ids[1] = 2
    topn_tok_ids, topn_tok_logprobs = batch_top_tokens(
        top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids
    )

    assert topn_tok_ids[0] == [[]]
    assert topn_tok_ids[1] == [[0, 3], [0, 3]]
    assert topn_tok_ids[2] == [[0, 3, 1, 4]]
    assert topn_tok_ids[3] == [[0, 3, 1, 4]]
    assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]]

    assert topn_tok_logprobs[0] == [[]]
    assert topn_tok_logprobs[1] == [[-1, -2], [-1, -2]]
    assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]]
    assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]]
    assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]]


================================================
FILE: server/tests/utils/test_watermark.py
================================================
# test_watermark_logits_processor.py

import os
import numpy as np
import torch
from text_generation_server.utils.watermark import WatermarkLogitsProcessor


GAMMA = os.getenv("WATERMARK_GAMMA", 0.5)
DELTA = os.getenv("WATERMARK_DELTA", 2.0)


def test_seed_rng():
    input_ids = [101, 2036, 3731, 102, 2003, 103]
    processor = WatermarkLogitsProcessor()
    processor._seed_rng(input_ids)
    assert isinstance(processor.rng, torch.Generator)


def test_get_greenlist_ids():
    input_ids = [101, 2036, 3731, 102, 2003, 103]
    processor = WatermarkLogitsProcessor()
    result = processor._get_greenlist_ids(input_ids, 10, torch.device("cpu"))
    assert max(result) <= 10
    assert len(result) == int(10 * 0.5)


def test_calc_greenlist_mask():
    processor = WatermarkLogitsProcessor()
    scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
    greenlist_token_ids = torch.tensor([2, 3])
    result = processor._calc_greenlist_mask(scores, greenlist_token_ids)
    assert result.tolist() == [[False, False, False, False], [False, False, True, True]]
    assert result.shape == scores.shape


def test_bias_greenlist_logits():
    processor = WatermarkLogitsProcessor()
    scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
    green_tokens_mask = torch.tensor(
        [[False, False, True, True], [False, False, False, True]]
    )
    greenlist_bias = 2.0
    result = processor._bias_greenlist_logits(scores, green_tokens_mask, greenlist_bias)
    assert np.allclose(result.tolist(), [[0.5, 0.3, 2.2, 2.8], [0.1, 0.2, 0.7, 2.9]])
    assert result.shape == scores.shape


def test_call():
    input_ids = [101, 2036, 3731, 102, 2003, 103]
    processor = WatermarkLogitsProcessor()
    scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
    result = processor(input_ids, scores)
    assert result.shape == scores.shape


================================================
FILE: server/tests/utils/test_weights.py
================================================
import pytest
import torch
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    Weights,
    WeightsLoader,
)
from text_generation_server.layers.gptq import GPTQWeight, GPTQWeightsLoader
from text_generation_server.layers.exl2 import Exl2Weight, Exl2WeightsLoader
from text_generation_server.layers.marlin.marlin import (
    MarlinWeight,
    MarlinWeightsLoader,
)
from types import SimpleNamespace
from typing import List, Optional, Dict, Union
from pathlib import Path


@pytest.fixture
def gptq_weights_loader():
    return GPTQWeightsLoader(
        bits=4,
        groupsize=-1,
        desc_act=False,
        quant_method="gptq",
        quantize="gptq",
        sym=True,
        modules_to_not_convert=[],
    )


@pytest.fixture
def gptq_weights_loader_awq():
    return GPTQWeightsLoader(
        bits=4,
        groupsize=-1,
        desc_act=False,
        quant_method="awq",
        quantize="awq",
        sym=True,
        modules_to_not_convert=[],
    )


@pytest.fixture
def marlin_weights_loader():
    return MarlinWeightsLoader(bits=4, is_marlin_24=False)


dummy_file_system = {
    "test_weights": {
        "layer.0.weight": torch.tensor(
            [
                [1, 2],
                [3, 4],
            ],
            dtype=torch.float32,
        ),
    },
    "test_weights_2": {
        "layer.1337.weight": torch.tensor(
            [
                [1, 2, 3, 4],
                [5, 6, 7, 8],
            ],
            dtype=torch.float32,
        ),
    },
    "test_get_weights_col_packed": {
        "weight.weight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.float32,
        ),
    },
    "test_get_multi_weights_col": {
        "weight.weight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.float32,
        ),
    },
    "test_get_weights_row": {
        "weight.weight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.float32,
        ),
    },
    "test_get_weights_col_gptq": {
        "weight.qweight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.float32,
        ),
        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        "weight.qzeros": torch.tensor(
            [
                [0, 1],
                [1, 0],
            ],
            dtype=torch.int32,
        ),
        "weight.scales": torch.tensor(
            [
                [100.0, 100.0],
                [100.0, 100.0],
            ],
            dtype=torch.float16,
        ),
        "gptq_bits": torch.tensor([8], dtype=torch.float32),
        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
    },
    "test_get_weights_col_marlin": {
        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        "weight.s": torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    },
    "test_get_weights_row_gptq": {
        "weight.qweight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.int32,
        ),
        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        "weight.qzeros": torch.tensor(
            [
                [0, 1],
                [1, 0],
            ],
            dtype=torch.int32,
        ),
        "weight.scales": torch.tensor(
            [
                [100.0, 100.0],
                [100.0, 100.0],
            ],
            dtype=torch.float16,
        ),
        "gptq_bits": torch.tensor([8], dtype=torch.float32),
        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
    },
    "test_get_multi_weights_col_gptq": {
        "weight.qweight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.int32,
        ),
        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        "weight.qzeros": torch.tensor(
            [
                [0, 1],
                [1, 0],
            ],
            dtype=torch.int32,
        ),
        "weight.scales": torch.tensor(
            [
                [100.0, 100.0],
                [100.0, 100.0],
            ],
            dtype=torch.float16,
        ),
        "gptq_bits": torch.tensor([8], dtype=torch.float32),
        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
    },
    "test_get_weights_col_packed_gptq": {
        "weight.qweight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.int32,
        ),
        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        "weight.qzeros": torch.tensor(
            [
                [0, 1],
                [1, 0],
            ],
            dtype=torch.int32,
        ),
        "weight.scales": torch.tensor(
            [
                [100.0, 100.0],
                [100.0, 100.0],
            ],
            dtype=torch.float16,
        ),
        "gptq_bits": torch.tensor([8], dtype=torch.float32),
        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
    },
    "test_get_weights_col_packed_exl2": {
        "weight.q_weight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.int32,
        ),
        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
    },
    "test_get_weights_row_exl2": {
        "weight.q_weight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.int32,
        ),
        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
    },
    "test_get_multi_weights_col_exl2": {
        "weight.q_weight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.int32,
        ),
        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
    },
    "test_get_weights_col_exl2": {
        "weight.q_weight": torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.int32,
        ),
        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
    },
    "test_get_weights_row_marlin": {
        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16),
    },
    "test_get_multi_weights_col_marlin": {
        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16),
    },
    "test_get_weights_col_packed_marlin": {
        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16),
    },
}


class MockSlice:
    def __init__(self, tensor):
        self.tensor = tensor

    def get_shape(self):
        return self.tensor.shape

    def __getitem__(self, idx):
        return self.tensor[idx]


def mock_get_slice(tensor_name, filename):
    tensor = dummy_file_system[filename][tensor_name]
    return MockSlice(tensor)


def mock_handle(filename, device, dtype):
    return SimpleNamespace(
        get_slice=lambda tensor_name: mock_get_slice(tensor_name, filename)
    )


class MockSafeOpen:
    def __init__(self, filename, framework, dummy_fs):
        self.filename = filename
        self.framework = framework
        self.dummy_fs = dummy_fs

    def keys(self):
        return list(self.dummy_fs[self.filename].keys())

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass


class MockWeights(Weights):
    def __init__(
        self,
        filenames: List[Union[Path, str]],
        device,
        dtype,
        process_group,
        dummy_fs,
        aliases: Optional[Dict[str, List[str]]] = None,
        prefix: Optional[str] = None,
        weights_loader: Optional[WeightsLoader] = None,
    ):
        routing = {}
        self.dummy_fs = dummy_fs
        for filename in filenames:
            with MockSafeOpen(filename, framework="pytorch", dummy_fs=dummy_fs) as f:
                for k in f.keys():
                    if k in routing:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename
        if aliases is None:
            aliases = {}
        self.aliases = aliases
        self.routing = routing
        self.device = device
        self.dtype = dtype
        self.process_group = process_group
        self.prefix = prefix
        self.weights_loader = (
            # We don't need to get linear layers, so just wrap raw tensors.
            DefaultWeightsLoader(lambda x: x)
            if weights_loader is None
            else weights_loader
        )
        self._handles = {}

    def _get_handle(self, filename: Union[Path, str]):
        if filename in self._handles:
            return self._handles[filename]
        else:
            handle = mock_handle(filename, self.device, self.dtype)
            self._handles[filename] = handle
            return handle

    def get_shape(self, tensor_name: str):
        filename, _ = self.get_filename(tensor_name)
        handle = self._get_handle(filename)
        return handle.get_slice(tensor_name).get_shape()

    def get_tensor(self, tensor_name: str):
        filename, _ = self.get_filename(tensor_name)
        handle = self._get_handle(filename)
        return handle.get_slice(tensor_name).tensor


dummy_process_group = SimpleNamespace(rank=lambda: 0, size=lambda: 1)


def test_weights():
    weights = MockWeights(
        [
            "test_weights",
            "test_weights_2",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )
    assert weights.get_shape("layer.0.weight") == (2, 2)
    assert weights.get_tensor("layer.1337.weight").shape == (2, 4)


def test_get_tensor():
    weights = MockWeights(
        [
            "test_weights",
            "test_weights_2",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )
    assert torch.allclose(
        weights.get_tensor("layer.0.weight"),
        torch.tensor(
            [
                [1, 2],
                [3, 4],
            ],
            dtype=torch.float32,
        ),
    )
    assert torch.allclose(
        weights.get_tensor("layer.1337.weight"),
        torch.tensor(
            [
                [1, 2, 3, 4],
                [5, 6, 7, 8],
            ],
            dtype=torch.float32,
        ),
    )


def test_get_weights_col_packed():

    weights = MockWeights(
        [
            "test_get_weights_col_packed",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    prefix = "weight"
    block_sizes = 1

    w = weights.get_weights_col_packed(
        prefix=prefix,
        block_sizes=block_sizes,
    )

    assert torch.allclose(
        w,
        torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.float32,
        ),
    )


def test_get_weights_col_packed_block_size():

    weights = MockWeights(
        [
            "test_get_weights_col_packed",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    prefix = "weight"
    block_sizes = 2

    w = weights.get_weights_col_packed(
        prefix=prefix,
        block_sizes=block_sizes,
    )

    assert torch.allclose(
        w,
        torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.float32,
        ),
    )


def test_get_weights_col_packed_block_size_arr():

    weights = MockWeights(
        [
            "test_get_weights_col_packed",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    prefix = "weight"
    block_sizes = [1, 1]

    w = weights.get_weights_col_packed(
        prefix=prefix,
        block_sizes=block_sizes,
    )

    assert torch.allclose(
        w,
        torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.float32,
        ),
    )


def test_get_multi_weights_col():
    weights = MockWeights(
        [
            "test_get_multi_weights_col",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    prefixes = ["weight", "weight"]

    w = weights.get_multi_weights_col(
        prefixes=prefixes,
        dim=0,
    )

    assert torch.allclose(
        w,
        torch.tensor(
            [
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
                [1, 2],
                [3, 4],
                [5, 6],
                [7, 8],
            ],
            dtype=torch.float32,
        ),
    )


def test_get_weights_row():
    weights = MockWeights(
        [
            "test_get_weights_row",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
    )

    prefix = "weight"

    w = weights.get_weights_row(
        prefix=prefix,
    )

    assert torch.allclose(
        w,
        torch.tensor(
            [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]],
            dtype=torch.float32,
        ),
    )


# test_get_weights_col


def test_get_weights_col_awq(gptq_weights_loader_awq):
    weights = MockWeights(
        [
            "test_get_weights_col_gptq",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader_awq,
    )

    prefix = "weight"

    w = weights.get_weights_col(
        prefix=prefix,
    )

    expected_weight = GPTQWeight(
        qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor(
            [[100.0, 100.0], [100.0, 100.0]],
            dtype=torch.float16,
        ),
        g_idx=None,
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=True,
        use_exllama=False,
    )

    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
    assert w.g_idx == expected_weight.g_idx, "g_idx mismatch"
    assert w.bits == expected_weight.bits, "bits mismatch"
    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
    assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch"
    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"


def test_get_weights_col_gtpq(gptq_weights_loader):
    weights = MockWeights(
        [
            "test_get_weights_col_gptq",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader,
    )

    prefix = "weight"

    w = weights.get_weights_col(
        prefix=prefix,
    )

    expected_weight = GPTQWeight(
        qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=False,
        use_exllama=False,
    )

    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
    assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch"
    assert w.bits == expected_weight.bits, "bits mismatch"
    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
    assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch"
    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"


def test_get_weights_col_exl2():
    weights = MockWeights(
        [
            "test_get_weights_col_exl2",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=Exl2WeightsLoader(),
    )

    prefix = "weight"

    w = weights.get_weights_col(
        prefix=prefix,
    )

    scaled_scale_max = 0.3906 * 256
    expected_weight = Exl2Weight(
        q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        q_scale=torch.tensor([8], dtype=torch.int32),
        q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16),
        q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16),
        q_groups=torch.tensor([4], dtype=torch.int16),
    )

    assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch"
    assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch"
    assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch"
    assert torch.allclose(
        w.q_scale_max, expected_weight.q_scale_max
    ), "q_scale_max mismatch"
    assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch"


def test_get_weights_col_marlin(marlin_weights_loader):
    weights = MockWeights(
        [
            "test_get_weights_col_marlin",
        ],
        device="cpu",
        dtype=torch.float16,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=marlin_weights_loader,
    )

    prefix = "weight"

    w = weights.get_weights_col(
        prefix=prefix,
    )

    expected_weight = MarlinWeight(
        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    )

    assert torch.allclose(w.B, expected_weight.B), "B mismatch"
    assert torch.allclose(w.s, expected_weight.s), "s mismatch"


# test_get_weights_col_packed


def test_get_weights_col_packed_awq(gptq_weights_loader_awq):
    weights = MockWeights(
        [
            "test_get_weights_col_packed_gptq",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader_awq,
    )

    prefix = "weight"
    block_sizes = 1

    w = weights.get_weights_col_packed(
        prefix=prefix,
        block_sizes=block_sizes,
    )

    expected_weight = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=None,
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=True,
        use_exllama=False,
    )

    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
    assert w.g_idx == expected_weight.g_idx, "g_idx mismatch"
    assert w.bits == expected_weight.bits, "bits mismatch"
    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
    assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch"
    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"


@pytest.mark.skip(reason="Review expected functionality")
def test_get_weights_col_packed_exl2():
    weights = MockWeights(
        [
            "test_get_weights_col_packed_exl2",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=Exl2WeightsLoader(),
    )

    prefix = "weight"
    block_sizes = 1

    w = weights.get_weights_col_packed(
        prefix=prefix,
        block_sizes=block_sizes,
    )

    scaled_scale_max = 0.3906 * 256
    expected_weight = Exl2Weight(
        q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        q_scale=torch.tensor([8], dtype=torch.int32),
        q_invperm=torch.tensor([1], dtype=torch.int16),
        q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16),
        q_groups=torch.tensor([4], dtype=torch.int16),
    )

    assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch"
    assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch"
    assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch"
    assert torch.allclose(
        w.q_scale_max, expected_weight.q_scale_max
    ), "q_scale_max mismatch"
    assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch"


def test_get_weights_col_packed_gptq(gptq_weights_loader):
    weights = MockWeights(
        [
            "test_get_weights_col_packed_gptq",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader,
    )

    prefixes = ["weight"]

    w = weights.get_multi_weights_col(
        prefixes=prefixes,
        dim=0,
    )

    expected_weight = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=False,
        use_exllama=False,
    )

    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
    assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch"
    assert w.bits == expected_weight.bits, "bits mismatch"
    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
    assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch"
    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"


def test_get_weights_col_packed_marlin(marlin_weights_loader):
    weights = MockWeights(
        [
            "test_get_weights_col_packed_marlin",
        ],
        device="cpu",
        dtype=torch.float16,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=marlin_weights_loader,
    )

    prefix = "weight"

    w = weights.get_multi_weights_col(
        prefixes=[prefix],
        dim=0,
    )

    expected_weight = MarlinWeight(
        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    )

    print(expected_weight)

    assert torch.allclose(w.B, expected_weight.B), "B mismatch"
    assert torch.allclose(w.s, expected_weight.s), "s mismatch"


# test_get_multi_weights_col


def test_get_multi_weights_col_awq(gptq_weights_loader_awq):
    weights = MockWeights(
        [
            "test_get_multi_weights_col_gptq",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader_awq,
    )

    prefixes = ["weight"]

    w = weights.get_multi_weights_col(
        prefixes=prefixes,
        dim=0,
    )

    expected_weight = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=None,
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=True,
        use_exllama=False,
    )

    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
    assert w.g_idx == expected_weight.g_idx, "g_idx mismatch"
    assert w.bits == expected_weight.bits, "bits mismatch"
    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
    assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch"
    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"


def test_get_multi_weights_col_exl2():
    weights = MockWeights(
        [
            "test_get_multi_weights_col_exl2",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=Exl2WeightsLoader(),
    )

    prefix = "weight"

    try:
        weights.get_multi_weights_col(
            prefixes=[prefix],
            dim=0,
        )
    except ValueError as e:
        assert e.args[0] == "get_multi_weights_col is not supported for exl2"


def test_get_multi_weights_col_gptq(gptq_weights_loader):
    weights = MockWeights(
        [
            "test_get_multi_weights_col_gptq",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader,
    )

    prefixes = ["weight"]

    w = weights.get_multi_weights_col(
        prefixes=prefixes,
        dim=0,
    )

    expected_weight = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=False,
        use_exllama=False,
    )

    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
    assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch"
    assert w.bits == expected_weight.bits, "bits mismatch"
    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
    assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch"
    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"


def test_get_multi_weights_col_marlin(marlin_weights_loader):
    weights = MockWeights(
        [
            "test_get_multi_weights_col_marlin",
        ],
        device="cpu",
        dtype=torch.float16,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=marlin_weights_loader,
    )

    prefix = "weight"

    w = weights.get_multi_weights_col(
        prefixes=[prefix],
        dim=0,
    )

    expected_weight = MarlinWeight(
        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    )

    assert torch.allclose(w.B, expected_weight.B), "B mismatch"
    assert torch.allclose(w.s, expected_weight.s), "s mismatch"


# test_get_weights_row


def test_get_weights_row_awq(gptq_weights_loader_awq):
    weights = MockWeights(
        [
            "test_get_weights_row_gptq",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader_awq,
    )

    prefix = "weight"

    w = weights.get_weights_row(
        prefix=prefix,
    )

    expected_weight = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=None,
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=True,
        use_exllama=False,
    )

    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
    assert w.g_idx == expected_weight.g_idx, "g_idx mismatch"
    assert w.bits == expected_weight.bits, "bits mismatch"
    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
    assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch"
    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"


def test_get_weights_row_exl2():
    weights = MockWeights(
        [
            "test_get_weights_row_exl2",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=Exl2WeightsLoader(),
    )

    prefix = "weight"

    w = weights.get_weights_row(
        prefix=prefix,
    )
    print(w)

    scaled_scale_max = 0.3906 * 256
    expected_weight = Exl2Weight(
        q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        q_scale=torch.tensor([8], dtype=torch.int32),
        q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16),
        q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16),
        q_groups=torch.tensor([4], dtype=torch.int16),
    )

    assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch"
    assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch"
    assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch"
    assert torch.allclose(
        w.q_scale_max, expected_weight.q_scale_max
    ), "q_scale_max mismatch"
    assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch"


def test_get_weights_row_gptq(gptq_weights_loader):
    weights = MockWeights(
        [
            "test_get_weights_row_gptq",
        ],
        device="cpu",
        dtype=torch.float32,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=gptq_weights_loader,
    )

    prefix = "weight"

    w = weights.get_weights_row(
        prefix=prefix,
    )

    expected_weight = GPTQWeight(
        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
        bits=8.0,
        groupsize=2.0,
        use_awq_kernel=False,
        use_exllama=False,
    )

    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
    assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch"
    assert w.bits == expected_weight.bits, "bits mismatch"
    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
    assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch"
    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"


def test_get_weights_row_marlin(marlin_weights_loader):
    weights = MockWeights(
        [
            "test_get_weights_row_marlin",
        ],
        device="cpu",
        dtype=torch.float16,
        process_group=dummy_process_group,
        dummy_fs=dummy_file_system,
        weights_loader=marlin_weights_loader,
    )

    prefix = "weight"

    w = weights.get_weights_row(
        prefix=prefix,
    )

    expected_weight = MarlinWeight(
        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
    )

    assert torch.allclose(w.B, expected_weight.B), "B mismatch"
    assert torch.allclose(w.s, expected_weight.s), "s mismatch"


================================================
FILE: server/text_generation_server/__init__.py
================================================


================================================
FILE: server/text_generation_server/adapters/__init__.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/adapters/__init__.py
# License:  Apache License Version 2.0, January 2004

from text_generation_server.adapters.weights import (
    AdapterBatchData,
    AdapterBatchMetadata,
)

__all__ = [
    "AdapterBatchData",
    "AdapterBatchMetadata",
]


================================================
FILE: server/text_generation_server/adapters/config.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/adapters/config.py
# License:  Apache License Version 2.0, January 2004

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Set, Tuple

import torch

from text_generation_server.adapters.weights import AdapterWeights


@dataclass
class ModuleMap:
    module_name: str
    module_weights: Dict[str, Tuple[torch.Tensor, str]]


@dataclass
class AdapterConfig(ABC):
    base_model_name_or_path: str

    @abstractmethod
    def map_weights_for_model(
        self,
        adapter_weights: Dict[int, AdapterWeights],
        weight_names: Tuple[str],
    ) -> Tuple[ModuleMap, Set[str]]:
        pass


================================================
FILE: server/text_generation_server/adapters/lora.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/adapters/lora.py
# License:  Apache License Version 2.0, January 2004

from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple, Type, Union

from loguru import logger
import torch
from peft import LoraConfig as _LoraConfig
from torch.distributed import ProcessGroup
from text_generation_server.utils.log import log_master
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.adapters.config import AdapterConfig, ModuleMap
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.adapters.weights import (
    AdapterBatchMetadata,
    AdapterWeights,
    BatchAdapterWeights,
)

if SYSTEM == "cuda":
    punica_sgmv = load_kernel(
        module="punica_sgmv", repo_id="kernels-community/punica-sgmv"
    )
else:
    punica_sgmv = None


def get_start_stop_idxs_for_rank(offset, size, rank, world_size):
    block_size = size // world_size
    start = offset + rank * block_size
    stop = offset + (rank + 1) * block_size
    return start, stop


def shard_on_dim(
    t: torch.Tensor, dim: int, process_group: torch.distributed.ProcessGroup
):
    world_size = process_group.size()
    rank = process_group.rank()

    size = t.shape[dim]
    start, stop = get_start_stop_idxs_for_rank(0, size, rank, world_size)

    if dim == 0:
        tensor = t[start:stop]
    elif dim == 1:
        tensor = t[:, start:stop]
    else:
        raise NotImplementedError("Let's make that generic when needed")

    return tensor


def shard_lora_weights(
    weights_a: List[torch.Tensor],
    weights_b: List[torch.Tensor],
    split_dim: int,
    process_group: ProcessGroup,
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    # [hidden_size, r]
    weights_a = [
        shard_on_dim(w, dim=split_dim, process_group=process_group) for w in weights_a
    ]

    # [r, hidden_size]
    weights_b = [shard_on_dim(w, dim=1, process_group=process_group) for w in weights_b]

    return weights_a, weights_b


@dataclass
class LoraConfig(AdapterConfig):
    r: int
    target_modules: Optional[Union[List[str], str]]
    fan_in_fan_out: bool
    lora_alpha: int
    use_rslora: bool

    def map_weights_for_model(
        self,
        adapter_weights: Dict[int, AdapterWeights],
        weight_names: Tuple[str],
    ) -> Tuple[ModuleMap, Set[str]]:
        adapter_weight_names = set()
        module_map = {}
        for weight_name in weight_names:
            lora_a_name = f"base_model.model.{weight_name}.lora_A.weight"
            lora_b_name = f"base_model.model.{weight_name}.lora_B.weight"
            if lora_a_name not in adapter_weights or lora_b_name not in adapter_weights:
                continue

            module_map[weight_name] = {
                "lora_A": (adapter_weights[lora_a_name], lora_a_name),
                "lora_B": (adapter_weights[lora_b_name], lora_b_name),
            }
            adapter_weight_names.add(lora_a_name)
            adapter_weight_names.add(lora_b_name)
        return module_map, adapter_weight_names

    @classmethod
    def load(cls, adapter_id: str, api_token: str) -> "LoraConfig":
        hf_config = _LoraConfig.from_pretrained(adapter_id, token=api_token)
        return cls(
            base_model_name_or_path=hf_config.base_model_name_or_path,
            r=hf_config.r,
            target_modules=hf_config.target_modules,
            fan_in_fan_out=hf_config.fan_in_fan_out,
            lora_alpha=hf_config.lora_alpha,
            use_rslora=(
                hf_config.use_rslora if hasattr(hf_config, "use_rslora") else False
            ),
        )


class LoraWeights(AdapterWeights):
    """LoRA weights for a single adapter merged across all layers."""

    def __init__(
        self,
        weights_a: List[torch.Tensor],
        weights_b: List[torch.Tensor],
        adapter_config: LoraConfig,
    ):
        self.lora_a_r = weights_a[0].size(1) if len(weights_a) > 0 else 1
        self.lora_b_r = weights_b[0].size(0) if len(weights_a) > 0 else 1

        self._is_transposed = False
        if SYSTEM == "ipex":
            self._use_cutlass_shrink = False
            # [num_layers, r, hidden_size]
            weights_a = [w.transpose(0, 1).contiguous() for w in weights_a]
            self._weights_a = torch.stack(weights_a)

            # [num_layers, hidden_size, r]
            weights_b = [w.transpose(0, 1).contiguous() for w in weights_b]
            self._weights_b = torch.stack(weights_b)
        else:
            self._use_cutlass_shrink = punica_sgmv.use_cutlass_shrink(self.lora_a_r)
            # [num_layers, hidden_size, r]
            weights_a = [
                punica_sgmv.orient_for_rank(w, w.size(1)).contiguous()
                for w in weights_a
            ]
            self._weights_a = torch.stack(weights_a)

            # [num_layers, r, hidden_size]
            self._weights_b = torch.stack(weights_b)

        self.adapter_config = adapter_config

    @property
    def weights_a(self) -> torch.Tensor:
        if self._is_transposed:
            self._transpose_weights()
        return self._weights_a

    @property
    def weights_b(self) -> torch.Tensor:
        if self._is_transposed:
            self._transpose_weights()
        return self._weights_b

    @property
    def weights_a_t(self) -> torch.Tensor:
        if not self._is_transposed:
            self._transpose_weights()
        return self._weights_a

    @property
    def weights_b_t(self) -> torch.Tensor:
        if not self._is_transposed:
            self._transpose_weights()
        return self._weights_b

    def _transpose_weights(self):
        if self._use_cutlass_shrink:
            # If we're not using the cutlass shrink, then both SGMV and BGMV use the same orientation
            self._weights_a = self._weights_a.transpose(1, 2).contiguous()
        self._weights_b = self._weights_b.transpose(1, 2).contiguous()
        self._is_transposed = not self._is_transposed

    @classmethod
    def get_batch_types(cls) -> List[Type[BatchAdapterWeights]]:
        if SYSTEM == "ipex":
            return [IPEXBatchLoraWeights]
        else:
            return [BatchLoraWeights]

    # prepare pre-loaded lora weights for use in the model.
    #
    # this method processes and organizes lora weights for a specific layer type across all layers:
    # - uses `config` (LoraConfig) to apply lora-specific settings like scaling factor.
    # - retrieves weights from `module_map` based on the `layer_type`.
    # - processes `nlayers` number of layers.
    # - converts weights to the specified `dtype`.
    # - shards weights across `world_size` number of processes using the `process_group`.
    # - maps weights to specific layers using `target_to_layer`.
    # - tracks `unused_weight_names` to identify any unused weights.
    #
    # the method handles weight transposition, scaling, and padding to ensure compatibility
    # with SGMV or BGMV operations.
    @classmethod
    def prepare_weights(
        cls,
        config: LoraConfig,
        module_map: Dict[str, Dict],
        layer_type: str,
        unused_weight_names: Set[str],
        nlayers: int,
        dtype: torch.dtype,
        world_size: int,
        process_group: ProcessGroup,
        target_to_layer: Dict[str, Tuple[str, torch.Tensor]],
    ) -> Optional[AdapterWeights]:
        lora_a_list = [None] * nlayers
        lora_b_list = [None] * nlayers

        for layer_id in range(nlayers):
            key = (layer_id, layer_type)
            if key not in target_to_layer:
                # There is no layer of this type in the model
                log_master(
                    logger.warning,
                    f"Key specified in lora weights but not found in base model: {key}",
                )
                return None

            weight_name, layer = target_to_layer[key]
            base_weight = layer.base_layer.linear.weight
            base_device = base_weight.device

            if weight_name not in module_map:
                # There is no LoRA weight for this layer type in the adapter
                return None

            lora_a, lora_a_name = module_map[weight_name]["lora_A"]
            lora_a = lora_a.to(base_device, dtype)

            lora_b, lora_b_name = module_map[weight_name]["lora_B"]
            lora_b = lora_b.to(base_device, dtype)

            scale = get_scaling_factor(
                config.lora_alpha,
                config.r,
                uses_rslora=config.use_rslora,
            )

            unused_weight_names.discard(lora_a_name)
            unused_weight_names.discard(lora_b_name)

            # Merge scaling factor into lora_b due to associativity of matrix multiplication:
            # (A * B) * C = A * (B * C)
            lora_a_list[layer_id] = lora_a.transpose(0, 1)
            lora_b_list[layer_id] = lora_b.transpose(0, 1) * scale

        # pad lora ranks to be compatible with sgmv
        if SYSTEM != "ipex":
            lora_a_list = [
                punica_sgmv.pad_rank(w, dim=1, world_size=world_size)
                for w in lora_a_list
            ]
            lora_b_list = [
                punica_sgmv.pad_rank(w, dim=0, world_size=world_size)
                for w in lora_b_list
            ]

            if lora_a_list:
                # update rank if it was padded
                padded_rank = lora_a_list[0].size(1)
                config.r = padded_rank

        return LoraWeights(
            *shard_lora_weights(
                weights_a=lora_a_list,
                weights_b=lora_b_list,
                split_dim=0 if layer_type in {"o_proj", "down_proj", "lm_head"} else 1,
                process_group=process_group,
            ),
            config,
        )


@dataclass
class RankSegments:
    rank: int

    lora_a_ptr: torch.Tensor
    lora_b_ptr: torch.Tensor

    # prefill (sgmv)
    tmp_shrink: torch.Tensor
    tmp_expand: torch.Tensor
    segment_starts: torch.Tensor
    segment_ends: torch.Tensor

    # decode (bgmv)
    indices: torch.Tensor


@dataclass
class BatchLoraWeights(BatchAdapterWeights):
    lora_a: Dict[int, torch.Tensor]
    lora_b: Dict[int, torch.Tensor]
    adapter_index_configs: Dict[int, LoraConfig]
    rank_data: Dict[int, RankSegments]
    use_sgmv: bool

    def has_adapter(self, adapter_index: int) -> bool:
        return adapter_index in self.adapter_index_configs

    def can_vectorize(self, pg: ProcessGroup) -> bool:
        return all(
            rank_data.rank // pg.size() <= punica_sgmv.MAX_RANK_CUSTOM
            for rank_data in self.rank_data.values()
        )

    @classmethod
    def load(
        self,
        adapter_weights: Dict[int, AdapterWeights],
        meta: AdapterBatchMetadata,
        prefill: bool,
        prefill_head_indices: Optional[torch.Tensor],
    ) -> Optional["BatchLoraWeights"]:
        adapter_weights = {k: _convert_lora(v) for k, v in adapter_weights.items()}
        adapter_weights = {
            k: v for k, v in adapter_weights.items() if isinstance(v, LoraWeights)
        }
        if not adapter_weights:
            return None

        first_weights = next(iter(adapter_weights.values()))
        device = first_weights.weights_a.device
        segment_indices = meta.segment_indices

        lora_a = {
            idx: adapter_weights[idx].weights_a
            for idx in segment_indices
            if idx in adapter_weights
        }
        lora_b = {
            idx: adapter_weights[idx].weights_b
            for idx in segment_indices
            if idx in adapter_weights
        }

        max_rank = max(
            (
                adapter_weights[idx].lora_a_r
                for idx in segment_indices
                if idx in adapter_weights
            ),
            default=0,
        )

        use_sgmv = False
        if prefill or max_rank > punica_sgmv.BGMV_MAX_RANK:
            if punica_sgmv is not None:
                use_sgmv = True
            lora_a_ptr = torch.tensor(
                [
                    (
                        adapter_weights[idx].weights_a.data_ptr()
                        if idx in adapter_weights
                        else 0
                    )
                    for idx in segment_indices
                ],
                dtype=torch.int64,
                device=device,
            )
            lora_b_ptr = torch.tensor(
                [
                    (
                        adapter_weights[idx].weights_b.data_ptr()
                        if idx in adapter_weights
                        else 0
                    )
                    for idx in segment_indices
                ],
                dtype=torch.int64,
                device=device,
            )
        else:
            lora_a_ptr = torch.tensor(
                [
                    (
                        adapter_weights[idx].weights_a_t.data_ptr()
                        if idx in adapter_weights
                        else 0
                    )
                    for idx in segment_indices
                ],
                dtype=torch.int64,
                device=device,
            )
            lora_b_ptr = torch.tensor(
                [
                    (
                        adapter_weights[idx].weights_b_t.data_ptr()
                        if idx in adapter_weights
                        else 0
                    )
                    for idx in segment_indices
                ],
                dtype=torch.int64,
                device=device,
            )

        adapter_index_configs = {
            idx: adapter_weights[idx].adapter_config
            for idx in segment_indices
            if idx in adapter_weights
        }

        adapter_to_segment = {v: k for k, v in enumerate(segment_indices)}

        rank_indices = defaultdict(list)
        for segment_idx, adapter_idx in enumerate(segment_indices):
            if adapter_idx not in adapter_weights:
                continue
            rank_indices[adapter_weights[adapter_idx].lora_a_r].append(segment_idx)

        if prefill_head_indices is not None:
            j, prefill_head_segment_starts, prefill_head_segment_ends = 1, [0], [0]
            for head_index in prefill_head_indices:
                # j cannot go out of bounds as that would mean there are tokens without corresponding adapters
                if head_index < meta.adapter_segments[j]:
                    prefill_head_segment_ends[-1] += 1
                else:
                    prefill_head_segment_starts.append(prefill_head_segment_ends[-1])
                    prefill_head_segment_ends.append(prefill_head_segment_ends[-1] + 1)
                    j += 1

        rank_data = {}
        for rank, indices in rank_indices.items():
            tmp_shrink = None
            tmp_expand = None
            segment_starts = None
            segment_ends = None
            batch_indices = None

            if use_sgmv:
                lora_a_ptr_indices = lora_a_ptr[indices]
                tmp_shrink, tmp_expand = punica_sgmv.get_tmp_tensors(
                    lora_a_ptr_indices.size(0), rank, device
                )
                segment_starts = meta.adapter_segments[indices]
                segment_ends = meta.adapter_segments[[i + 1 for i in indices]]
                if prefill_head_indices is not None:
                    for i, segment_index in enumerate(indices):
                        segment_starts[i] = prefill_head_segment_starts[segment_index]
                        segment_ends[i] = prefill_head_segment_ends[segment_index]
            else:
                rank_indices = set(indices)
                batch_indices = [
                    adapter_to_segment[idx] for idx in meta.adapter_indices.tolist()
                ]
                batch_indices = [
                    idx if idx in rank_indices else -1 for idx in batch_indices
                ]
                batch_indices = torch.tensor(
                    batch_indices, dtype=torch.int64, device=device
                )

            rank_data[rank] = RankSegments(
                rank=rank,
                tmp_shrink=tmp_shrink,
                tmp_expand=tmp_expand,
                lora_a_ptr=lora_a_ptr[indices],
                lora_b_ptr=lora_b_ptr[indices],
                segment_starts=segment_starts,
                segment_ends=segment_ends,
                indices=batch_indices,
            )

        return BatchLoraWeights(
            lora_a=lora_a,
            lora_b=lora_b,
            adapter_index_configs=adapter_index_configs,
            rank_data=rank_data,
            use_sgmv=use_sgmv,
        )


@dataclass
class IPEXBatchLoraWeights(BatchLoraWeights):
    @classmethod
    def load(
        self,
        adapter_weights: Dict[int, AdapterWeights],
        meta: AdapterBatchMetadata,
        prefill: bool,
        prefill_head_indices: Optional[torch.Tensor],
    ) -> Optional["BatchLoraWeights"]:
        adapter_weights = {k: _convert_lora(v) for k, v in adapter_weights.items()}
        adapter_weights = {
            k: v for k, v in adapter_weights.items() if isinstance(v, LoraWeights)
        }
        if not adapter_weights:
            return None

        first_weights = next(iter(adapter_weights.values()))
        device = first_weights.weights_a.device
        segment_indices = meta.segment_indices

        lora_a = {
            idx: adapter_weights[idx].weights_a
            for idx in segment_indices
            if idx in adapter_weights
        }
        lora_b = {
            idx: adapter_weights[idx].weights_b
            for idx in segment_indices
            if idx in adapter_weights
        }
        adapter_index_configs = {
            idx: adapter_weights[idx].adapter_config
            for idx in segment_indices
            if idx in adapter_weights
        }
        if len(lora_a) != 0:
            lora_a_ptr = torch.stack(list(lora_a.values()))
        if len(lora_b) != 0:
            lora_b_ptr = torch.stack(list(lora_b.values()))

        use_sgmv = True if prefill else False

        adapter_to_segment = {v: k for k, v in enumerate(segment_indices)}

        rank_indices = defaultdict(list)
        for segment_idx, adapter_idx in enumerate(segment_indices):
            if adapter_idx not in adapter_weights:
                continue
            rank_indices[adapter_weights[adapter_idx].lora_a_r].append(segment_idx)

        if prefill_head_indices is not None:
            j, prefill_head_segment_starts, prefill_head_segment_ends = 1, [0], [0]
            for head_index in prefill_head_indices:
                # j cannot go out of bounds as that would mean there are tokens without corresponding adapters
                if head_index < meta.adapter_segments[j]:
                    prefill_head_segment_ends[-1] += 1
                else:
                    prefill_head_segment_starts.append(prefill_head_segment_ends[-1])
                    prefill_head_segment_ends.append(prefill_head_segment_ends[-1] + 1)
                    j += 1

        rank_data = {}
        segment_starts = None
        segment_ends = None
        if use_sgmv:
            segment_starts = meta.adapter_segments[:-1]
            segment_ends = meta.adapter_segments[1:]
            if prefill_head_indices is not None:
                segment_starts = prefill_head_segment_starts[:-1]
                segment_ends = prefill_head_segment_ends[1:]
        batch_indices = [
            adapter_to_segment[idx] for idx in meta.adapter_indices.tolist()
        ]
        for rank, indices in rank_indices.items():
            adapters_indices = []
            lora_a_keys = list(lora_a.keys())
            for segment_idx in batch_indices:
                if segment_idx in indices:
                    adapters_indices.append(
                        lora_a_keys.index(segment_indices[segment_idx])
                    )
                else:
                    adapters_indices.append(-1)
            adapters_indices = torch.tensor(
                adapters_indices, dtype=torch.int64, device=device
            )
            if use_sgmv:
                adapters_indices = adapters_indices[segment_starts]
            rank_data[rank] = RankSegments(
                rank=rank,
                tmp_shrink=None,
                tmp_expand=None,
                lora_a_ptr=lora_a_ptr,
                lora_b_ptr=lora_b_ptr,
                segment_starts=segment_starts,
                segment_ends=segment_ends,
                indices=adapters_indices,
            )

        return BatchLoraWeights(
            lora_a=lora_a,
            lora_b=lora_b,
            adapter_index_configs=adapter_index_configs,
            rank_data=rank_data,
            use_sgmv=use_sgmv,
        )


def get_scaling_factor(
    lora_alpha: int,
    r: int,
    uses_rslora: bool = False,
) -> float:
    """Computes the scaling factor for the lora weights."""
    if uses_rslora:
        return lora_alpha / (r**0.5)
    return lora_alpha / r


def _convert_lora(v: AdapterWeights) -> AdapterWeights:
    if hasattr(v, "lora_weights"):
        return v.lora_weights
    return v


================================================
FILE: server/text_generation_server/adapters/weights.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/adapters/weights.py
# License:  Apache License Version 2.0, January 2004

from abc import ABC, abstractclassmethod
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Type

import torch


@dataclass
class AdapterBatchMetadata:
    # [batch_size]
    adapter_indices: torch.Tensor

    # [num_adapters]
    adapter_set: Set[int]

    # [num_segments + 1]
    adapter_segments: torch.Tensor

    # [num_segments]
    # maps from segment index to adapter index, i.e.:
    # segment_indices[s] == adapter_indices[i]
    segment_indices: List[int]


class AdapterWeights(ABC):
    @abstractclassmethod
    def get_batch_types(cls) -> List[Type["BatchAdapterWeights"]]:
        pass

    @property
    def speculative_tokens(self) -> int:
        return 0


class BatchAdapterWeights(ABC):
    @abstractclassmethod
    def has_adapter(self, adapter_index: int) -> bool:
        pass

    @abstractclassmethod
    def load(
        cls,
        adapter_weights: Dict[int, AdapterWeights],
        meta: "AdapterBatchMetadata",
        prefill: bool,
        prefill_head_indices: torch.Tensor,
    ) -> Optional["BatchAdapterWeights"]:
        pass


class LayerAdapterWeights:
    """Adapter weights that apply to a particular layer."""

    def __init__(self):
        self.adapter_weights: Dict[int, AdapterWeights] = {}

    def add_adapter(self, adapter_idx: int, weights: AdapterWeights):
        self.adapter_weights[adapter_idx] = weights

    def remove_adapter(self, adapter_idx: int):
        if adapter_idx not in self.adapter_weights:
            return
        del self.adapter_weights[adapter_idx]

    def is_empty(self) -> bool:
        return len(self.adapter_weights) == 0

    def get_data(
        self,
        meta: AdapterBatchMetadata,
        prefill: bool,
        prefill_head_indices: Optional[torch.Tensor],
    ) -> Dict[str, BatchAdapterWeights]:
        # bucket adapters by batch class
        adapter_batch_types: Dict[
            Type[BatchAdapterWeights], Dict[int, AdapterWeights]
        ] = defaultdict(dict)
        for adapter_index, adapter_weights in self.adapter_weights.items():
            for batch_type in adapter_weights.get_batch_types():
                adapter_batch_types[batch_type][adapter_index] = adapter_weights

        batch_data = {}
        for batch_type, adapter_weights in adapter_batch_types.items():
            batched_weights = batch_type.load(
                adapter_weights, meta, prefill, prefill_head_indices
            )
            if batched_weights is not None:
                batch_data = batched_weights
        return batch_data


@dataclass
class AdapterBatchData:
    meta: AdapterBatchMetadata

    # layer type -> adapter type -> batch weight data
    data: Dict[str, Dict[str, BatchAdapterWeights]]

    prefill: bool

    @staticmethod
    def from_meta(
        meta: AdapterBatchMetadata,
        weights: Dict[str, LayerAdapterWeights],
        prefill: bool,
        prefill_head_indices: Optional[torch.Tensor],
    ) -> "AdapterBatchData":
        data = {}
        for k, v in weights.items():
            if v.is_empty():
                continue
            data[k] = v.get_data(
                meta, prefill, prefill_head_indices if k == "lm_head" else None
            )
        return AdapterBatchData(meta=meta, data=data, prefill=prefill)

    def ranks(self) -> Set[int]:
        # TODO(travis): refactor to be less coupled to lora implementation
        ranks = set()
        for lora_data in self.data.values():
            if lora_data is None:
                continue

            for rank_data in lora_data.rank_data.values():
                ranks.add(rank_data.rank)

        return ranks

    def layer_names(self) -> Set[str]:
        return set(self.data.keys())

    def adapter_keys(self) -> Set[str]:
        adapter_keys = set()
        for layer_data in self.data.values():
            adapter_keys.update(layer_data.keys())
        return adapter_keys

    @property
    def max_rank(self) -> int:
        ranks = self.ranks()
        return max(ranks) if len(ranks) > 0 else 0


================================================
FILE: server/text_generation_server/cache.py
================================================
import torch

from typing import Dict, Optional, TypeVar

from text_generation_server.models.types import Batch

B = TypeVar("B", bound=Batch)


class Cache:
    def __init__(self):
        self.cache: Dict[int, B] = {}

    def pop(self, batch_id: int) -> Optional[B]:
        return self.cache.pop(batch_id, None)

    def set(self, entry: B):
        if entry is not None:
            self.cache[entry.batch_id] = entry

    def delete(self, batch_id: int):
        batch = self.pop(batch_id)
        if batch is not None:
            del batch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def clear(self):
        keys = list(self.cache.keys())
        for k in keys:
            self.delete(k)

    def __len__(self):
        return len(self.cache.keys())


================================================
FILE: server/text_generation_server/cli.py
================================================
import os
import sys
import typer

from pathlib import Path
from loguru import logger
from typing import Optional
from enum import Enum
from huggingface_hub import hf_hub_download
from text_generation_server.utils.adapter import parse_lora_adapters

# Dummy change should cache hit.


app = typer.Typer()


class Quantization(str, Enum):
    bitsandbytes = "bitsandbytes"
    bitsandbytes_nf4 = "bitsandbytes-nf4"
    bitsandbytes_fp4 = "bitsandbytes-fp4"
    gptq = "gptq"
    awq = "awq"
    compressed_tensors = "compressed-tensors"
    eetq = "eetq"
    exl2 = "exl2"
    fp8 = "fp8"
    marlin = "marlin"


class Dtype(str, Enum):
    float16 = "float16"
    bloat16 = "bfloat16"


class KVCacheDtype(str, Enum):
    fp8_e4m3fn = "fp8_e4m3fn"
    fp8_e5m2 = "fp8_e5m2"


@app.command()
def serve(
    model_id: str,
    revision: Optional[str] = None,
    sharded: bool = False,
    quantize: Optional[Quantization] = None,
    speculate: Optional[int] = None,
    dtype: Optional[Dtype] = None,
    kv_cache_dtype: Optional[KVCacheDtype] = None,
    trust_remote_code: bool = False,
    uds_path: Path = "/tmp/text-generation-server",
    logger_level: str = "INFO",
    json_output: bool = False,
    otlp_endpoint: Optional[str] = None,
    otlp_service_name: str = "text-generation-inference.server",
    max_input_tokens: Optional[int] = None,
):
    if sharded:
        assert (
            os.getenv("RANK", None) is not None
        ), "RANK must be set when sharded is True"
        assert (
            os.getenv("WORLD_SIZE", None) is not None
        ), "WORLD_SIZE must be set when sharded is True"
        assert (
            os.getenv("MASTER_ADDR", None) is not None
        ), "MASTER_ADDR must be set when sharded is True"
        assert (
            os.getenv("MASTER_PORT", None) is not None
        ), "MASTER_PORT must be set when sharded is True"

    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    # Import here after the logger is added to log potential import exceptions
    from text_generation_server import server
    from text_generation_server.tracing import setup_tracing

    # Setup OpenTelemetry distributed tracing
    if otlp_endpoint is not None:
        setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint)

    lora_adapters = parse_lora_adapters(os.getenv("LORA_ADAPTERS"))

    # TODO: enable lora with cuda graphs. for now disable cuda graphs if lora is enabled
    # and warn the user
    if lora_adapters:
        logger.warning("LoRA adapters enabled (experimental feature).")

        if "CUDA_GRAPHS" in os.environ:
            logger.warning(
                "LoRA adapters incompatible with CUDA Graphs. Disabling CUDA Graphs."
            )
            global CUDA_GRAPHS
            CUDA_GRAPHS = None

    # Downgrade enum into str for easier management later on
    quantize = None if quantize is None else quantize.value
    dtype = None if dtype is None else dtype.value
    kv_cache_dtype = None if kv_cache_dtype is None else kv_cache_dtype.value
    if dtype is not None and quantize not in {
        None,
        "bitsandbytes",
        "bitsandbytes-nf4",
        "bitsandbytes-fp4",
    }:
        raise RuntimeError(
            "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
        )
    server.serve(
        model_id,
        lora_adapters,
        revision,
        sharded,
        quantize,
        speculate,
        dtype,
        kv_cache_dtype,
        trust_remote_code,
        uds_path,
        max_input_tokens,
    )


@app.command()
def download_weights(
    model_id: str,
    revision: Optional[str] = None,
    extension: str = ".safetensors",
    auto_convert: bool = True,
    logger_level: str = "INFO",
    json_output: bool = False,
    trust_remote_code: bool = False,
    merge_lora: bool = False,
):
    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    # Import here after the logger is added to log potential import exceptions
    from text_generation_server import utils

    # Test if files were already download
    try:
        utils.weight_files(model_id, revision, extension)
        logger.info("Files are already present on the host. " "Skipping download.")
        return
    # Local files not found
    except (utils.LocalEntryNotFoundError, FileNotFoundError, utils.EntryNotFoundError):
        pass

    is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv(
        "WEIGHTS_CACHE_OVERRIDE", None
    ) is not None

    if not is_local_model:
        # TODO: maybe reverse the default value of merge_lora?
        # currently by default we don't merge the weights with the base model
        if merge_lora:
            try:
                hf_hub_download(
                    model_id, revision=revision, filename="adapter_config.json"
                )
                utils.download_and_unload_peft(
                    model_id, revision, trust_remote_code=trust_remote_code
                )
                is_local_model = True
                utils.weight_files(model_id, revision, extension)
                return
            except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
                pass
        else:
            try:
                utils.peft.download_peft(
                    model_id, revision, trust_remote_code=trust_remote_code
                )
            except Exception:
                pass

        try:
            import json

            config = hf_hub_download(
                model_id, revision=revision, filename="config.json"
            )
            with open(config, "r") as f:
                config = json.load(f)

            base_model_id = config.get("base_model_name_or_path", None)
            if base_model_id and base_model_id != model_id:
                try:
                    logger.info(f"Downloading parent model {base_model_id}")
                    download_weights(
                        model_id=base_model_id,
                        revision="main",
                        extension=extension,
                        auto_convert=auto_convert,
                        logger_level=logger_level,
                        json_output=json_output,
                        trust_remote_code=trust_remote_code,
                    )
                except Exception:
                    pass
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass

        # Try to download weights from the hub
        try:
            filenames = utils.weight_hub_files(model_id, revision, extension)
            utils.download_weights(filenames, model_id, revision)
            # Successfully downloaded weights
            return

        # No weights found on the hub with this extension
        except utils.EntryNotFoundError as e:
            # Check if we want to automatically convert to safetensors or if we can use .bin weights instead
            if not extension == ".safetensors" or not auto_convert:
                raise e

    elif (Path(model_id) / "adapter_config.json").exists():
        # Try to load as a local PEFT model
        try:
            utils.download_and_unload_peft(
                model_id, revision, trust_remote_code=trust_remote_code
            )
            utils.weight_files(model_id, revision, extension)
            return
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass
    elif (Path(model_id) / "config.json").exists():
        # Try to load as a local Medusa model
        try:
            import json

            config = Path(model_id) / "config.json"
            with open(config, "r") as f:
                config = json.load(f)

            base_model_id = config.get("base_model_name_or_path", None)
            if base_model_id:
                try:
                    logger.info(f"Downloading parent model {base_model_id}")
                    download_weights(
                        model_id=base_model_id,
                        revision="main",
                        extension=extension,
                        auto_convert=auto_convert,
                        logger_level=logger_level,
                        json_output=json_output,
                        trust_remote_code=trust_remote_code,
                    )
                except Exception:
                    pass
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass

    # Try to see if there are local pytorch weights
    try:
        # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE
        try:
            local_pt_files = utils.weight_files(model_id, revision, ".bin")
        except Exception:
            local_pt_files = utils.weight_files(model_id, revision, ".pt")

    # No local pytorch weights
    except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
        if extension == ".safetensors":
            logger.warning(
                f"No safetensors weights found for model {model_id} at revision {revision}. "
                f"Downloading PyTorch weights."
            )

        # Try to see if there are pytorch weights on the hub
        pt_filenames = utils.weight_hub_files(model_id, revision, ".bin")
        # Download pytorch weights
        local_pt_files = utils.download_weights(pt_filenames, model_id, revision)

    if auto_convert:
        if not trust_remote_code:
            logger.warning(
                "🚨🚨BREAKING CHANGE in 2.0🚨🚨: Safetensors conversion is disabled without `--trust-remote-code` because "
                "Pickle files are unsafe and can essentially contain remote code execution!"
                "Please check for more information here: https://huggingface.co/docs/text-generation-inference/basic_tutorials/safety",
            )

        logger.warning(
            f"No safetensors weights found for model {model_id} at revision {revision}. "
            f"Converting PyTorch weights to safetensors."
        )

        # Safetensors final filenames
        local_st_files = [
            p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors"
            for p in local_pt_files
        ]
        try:
            import transformers
            import json

            if is_local_model:
                config_filename = os.path.join(model_id, "config.json")
            else:
                config_filename = hf_hub_download(
                    model_id, revision=revision, filename="config.json"
                )
            with open(config_filename, "r") as f:
                config = json.load(f)
            architecture = config["architectures"][0]

            class_ = getattr(transformers, architecture)

            # Name for this varible depends on transformers version.
            discard_names = getattr(class_, "_tied_weights_keys", [])

        except Exception:
            discard_names = []
        # Convert pytorch weights to safetensors
        utils.convert_files(local_pt_files, local_st_files, discard_names)


@app.command()
def quantize(
    model_id: str,
    output_dir: str,
    revision: Optional[str] = None,
    logger_level: str = "INFO",
    json_output: bool = False,
    trust_remote_code: bool = False,
    upload_to_model_id: Optional[str] = None,
    percdamp: float = 0.01,
    act_order: bool = False,
    groupsize: int = 128,
):
    if revision is None:
        revision = "main"
    download_weights(
        model_id=model_id,
        revision=revision,
        logger_level=logger_level,
        json_output=json_output,
    )
    from text_generation_server.layers.gptq.quantize import quantize

    quantize(
        model_id=model_id,
        bits=4,
        groupsize=groupsize,
        output_dir=output_dir,
        revision=revision,
        trust_remote_code=trust_remote_code,
        upload_to_model_id=upload_to_model_id,
        percdamp=percdamp,
        act_order=act_order,
        sym=True,
    )


if __name__ == "__main__":
    app()


================================================
FILE: server/text_generation_server/interceptor.py
================================================
import torch
import grpc

from google.rpc import status_pb2, code_pb2
from grpc_status import rpc_status
from grpc_interceptor.server import AsyncServerInterceptor
from loguru import logger
from typing import Callable, Any


class ExceptionInterceptor(AsyncServerInterceptor):
    def __init__(self, shutdown_callback):
        self.shutdown_callback = shutdown_callback

    async def intercept(
        self,
        method: Callable,
        request_or_iterator: Any,
        context: grpc.ServicerContext,
        method_name: str,
    ) -> Any:
        try:
            response = method(request_or_iterator, context)
            return await response
        except Exception as err:
            method_name = method_name.split("/")[-1]
            logger.exception(f"Method {method_name} encountered an error.")

            # Runtime Error cannot be recovered from
            if isinstance(err, RuntimeError):
                self.shutdown_callback()

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            await context.abort_with_status(
                rpc_status.to_status(
                    status_pb2.Status(code=code_pb2.INTERNAL, message=str(err))
                )
            )


================================================
FILE: server/text_generation_server/layers/__init__.py
================================================
from text_generation_server.layers.tensor_parallel import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    TensorParallelEmbedding,
)
from text_generation_server.layers.linear import (
    get_linear,
    FastLinear,
)
from text_generation_server.layers.speculative import SpeculativeHead

# Just to add the `load` methods.
from text_generation_server.layers.layernorm import load_layer_norm
from text_generation_server.layers.conv import load_conv2d

from text_generation_server.layers.lora import (
    LoraLinear,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)

__all__ = [
    "get_linear",
    "FastLinear",
    "TensorParallelColumnLinear",
    "TensorParallelRowLinear",
    "TensorParallelEmbedding",
    "SpeculativeHead",
    "LoraLinear",
    "TensorParallelMultiAdapterLinear",
    "TensorParallelAdapterRowLinear",
    "load_layer_norm",
    "load_conv2d",
]


================================================
FILE: server/text_generation_server/layers/attention/__init__.py
================================================
import os

from text_generation_server.utils.import_utils import SYSTEM

from .common import Seqlen

if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
    raise ImportError("`USE_FLASH_ATTENTION` is false.")
if SYSTEM == "cuda":
    from .cuda import (
        SUPPORTS_WINDOWING,
        attention,
        paged_attention,
    )
elif SYSTEM == "rocm":
    from .rocm import (
        SUPPORTS_WINDOWING,
        attention,
        paged_attention,
    )
elif SYSTEM == "ipex":
    from .ipex import (
        SUPPORTS_WINDOWING,
        attention,
        paged_attention,
    )
else:
    raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention")

# KVCache needs `reshape_and_cache`, so ensure that it is defined already.
from .kv_cache import KVCache, get_kv_scales

__all__ = [
    "attention",
    "get_kv_scales",
    "paged_attention",
    "SUPPORTS_WINDOWING",
    "KVCache",
    "Seqlen",
]


================================================
FILE: server/text_generation_server/layers/attention/common.py
================================================
from dataclasses import dataclass
import torch
from typing import Optional


@dataclass
class Seqlen:
    input_lengths: torch.Tensor
    cache_lengths: torch.Tensor
    cu_seqlen_q: Optional[torch.Tensor]
    cu_seqlen_k: Optional[torch.Tensor]
    max_q: int
    max_k: int

    def __init__(
        self,
        input_lengths,
        cache_lengths,
        cu_seqlen_q=None,
        max_q=None,
        max_k=None,
    ):
        self.input_lengths = input_lengths
        self.cache_lengths = cache_lengths
        device = self.input_lengths.device
        shape = self.input_lengths.shape
        if cu_seqlen_q is None:
            cu_seqlen_q = torch.arange(
                shape[0] + 1,
                device=device,
                dtype=torch.int32,
            )
            max_q = 1
        else:
            assert max_q is not None
        assert max_k is not None
        cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32)

        # cuda graphs don't like this and this is necessary to clamp within mistral
        # Although FA2 might not want the clamping
        # cu_seqlen_k[0] = 0
        total = self.input_lengths + self.cache_lengths
        torch.cumsum(total, -1, out=cu_seqlen_k[1:])

        self.cu_seqlen_q = cu_seqlen_q
        self.cu_seqlen_k = cu_seqlen_k
        self.max_q = max_q
        self.max_k = max_k

    def clamp(self, max):
        # Flash decoding doesn't need to clamp
        return self


================================================
FILE: server/text_generation_server/layers/attention/cuda.py
================================================
import torch
from text_generation_server.layers.attention.kv_cache import KVCache, KVScales
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.models.globals import (
    ATTENTION,
    BLOCK_SIZE,
)
from text_generation_server.layers.attention import Seqlen
from typing import Optional


major, minor = torch.cuda.get_device_capability()
is_sm75 = major == 7 and minor == 5
_PARTITION_SIZE = 512

if SYSTEM == "cuda":
    try:
        paged_attention_kernels = load_kernel(
            module="paged_attention", repo_id="kernels-community/paged-attention"
        )
    except Exception as e:
        raise ImportError(
            f"Could not import attention kernels. Make sure your installation is correct. Complete error: {e}"
        )
else:
    paged_attention_kernels = None


def paged_attention(
    query: torch.Tensor,
    kv_cache: KVCache,
    kv_head_mapping: torch.Tensor,
    softmax_scale: float,
    block_tables: torch.Tensor,
    seqlen: Seqlen,
    max_s: int,
    *,
    kv_scales: KVScales,
    softcap: Optional[float] = None,
    window_size_left: Optional[int] = -1,
):
    # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
    # Copyright 2023 The vLLM team. All rights
    # reserved.
    #
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    #
    #     http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    #

    # value_cache => [num_blocks, num_heads, head_size, block_size]
    # block_size = value_cache.shape[3]
    block_size = BLOCK_SIZE
    num_seqs, num_heads, head_size = query.shape
    max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE

    can_scale = kv_cache.can_scale(kv_scales)

    # NOTE(woosuk): We use a simple heuristic to decide whether to use
    # PagedAttention V1 or V2. If the number of partitions is 1, we use
    # V1 to avoid the overhead of reduction. Also, if the number of
    # sequences or heads is large, we use V1 since there is enough work
    # to parallelize.
    if ATTENTION == "flashinfer":
        from text_generation_server.layers.attention.flashinfer import decode_state

        return decode_state.get().forward(
            query,
            paged_kv_cache=(kv_cache.key, kv_cache.value),
            logits_soft_cap=softcap,
            sm_scale=softmax_scale,
            k_scale=kv_scales.key_scale_cpu if can_scale else 1.0,
            v_scale=kv_scales.value_scale_cpu if can_scale else 1.0,
            window_left=window_size_left,
        )
    elif ATTENTION == "flashdecoding":
        max_q = 1
        max_k = max_s
        import flash_attn_2_cuda

        window_size_right = -1 if window_size_left == -1 else 0

        # TODO fixme when flash contains the fix.
        # Number of splits is not correctly handled
        # by the current path
        # https://github.com/Dao-AILab/flash-attention/blob/320fb59487658f033f56711efd3d61b7c7a6f8f3/csrc/flash_attn/flash_api.cpp#L577
        # This fails becuase we're using causal, therefore window_right is set to 0 and the split logic is never applied.
        if softcap is None:
            softcap = 0.0
        out = flash_attn_2_cuda.varlen_fwd(
            query,
            kv_cache.key,
            kv_cache.value,
            None,
            seqlen.cu_seqlen_q,
            seqlen.cu_seqlen_k,
            None,  # pad_k
            None,
            block_tables,
            None,
            max_q,
            max_k,
            0.0,  # dropout
            softmax_scale,
            False,  # zero_tensors
            True,  # causal
            window_size_left,  # Window_left
            window_size_right,  # Window right
            softcap,
            False,  # return softmax
            None,  # generator
        )
        return out[0]
    else:
        if softcap is not None:
            raise RuntimeError("Paged attention doesn't support softcapping")
        input_lengths = seqlen.input_lengths + seqlen.cache_lengths

        out = torch.empty_like(query)

        kv_cache_dtype = "fp8" if kv_cache.dtype == torch.float8_e4m3fn else "auto"

        use_v1 = max_s <= 8192 and (
            max_num_partitions == 1 or num_seqs * num_heads > 512
        )
        if use_v1:
            paged_attention_kernels.paged_attention_v1(
                out,
                query,
                kv_cache.key,
                kv_cache.value,
                kv_cache.key.shape[1],
                softmax_scale,
                block_tables,
                input_lengths,
                block_size,
                max_s,
                None,
                kv_cache_dtype,
                torch.tensor(kv_scales.key_scale_cpu if can_scale else 1.0),
                torch.tensor(kv_scales.value_scale_cpu if can_scale else 1.0),
            )
        else:
            # Run PagedAttention V2.
            assert _PARTITION_SIZE % block_size == 0
            tmp_output = torch.empty(
                size=(num_seqs, num_heads, max_num_partitions, head_size),
                dtype=out.dtype,
                device=out.device,
            )
            exp_sums = torch.empty(
                size=(num_seqs, num_heads, max_num_partitions),
                dtype=torch.float32,
                device=out.device,
            )
            max_logits = torch.empty_like(exp_sums)

            paged_attention_kernels.paged_attention_v2(
                out,
                exp_sums,
                max_logits,
                tmp_output,
                query,
                kv_cache.key,
                kv_cache.value,
                kv_cache.key.shape[1],
                softmax_scale,
                block_tables,
                input_lengths,
                block_size,
                max_s,
                None,
                kv_cache_dtype,
                torch.tensor(kv_scales.key_scale_cpu if can_scale else 1.0),
                torch.tensor(kv_scales.value_scale_cpu if can_scale else 1.0),
            )
    return out


try:
    is_ampere_or_newer = major >= 8 and minor >= 0
    if not is_ampere_or_newer:
        raise ImportError("FlashAttention only supports Ampere GPUs or newer.")

    import flash_attn_2_cuda

    V2 = True
except ImportError:
    try:
        import flash_attn_cuda

        V2 = False
    except ImportError as e:
        if major >= 8:
            architecture_suffix = f"-{SYSTEM}"
            raise ImportError(
                "Flash Attention V2 is not installed.\n"
                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
                f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
            )
        elif is_sm75:
            raise ImportError(
                "Flash Attention is not installed.\n"
                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
                "or install flash attention with `cd server && make install install-flash-attention`"
            ) from e
        else:
            raise ImportError(
                f"GPU with CUDA capability {major} {minor} is not supported"
            ) from e


if ATTENTION == "flashdecoding" and not V2:
    raise ValueError("Flash decoding requires Flash Attention V2")

SUPPORTS_WINDOWING = V2


def attention(
    *,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: KVCache,
    kv_scales: KVScales,
    seqlen: Seqlen,
    block_tables: torch.Tensor,
    softmax_scale: float,
    window_size_left: int = -1,
    causal: bool = True,
    softcap: Optional[float] = None,
):
    can_scale = kv_cache.can_scale(kv_scales)

    if ATTENTION == "flashinfer":
        from text_generation_server.layers.attention.flashinfer import (
            prefill_with_paged_kv_state,
        )

        if softcap is None:
            softcap = 0.0

        return prefill_with_paged_kv_state.get().forward(
            query,
            causal=causal,
            paged_kv_cache=(kv_cache.key, kv_cache.value),
            logits_soft_cap=softcap,
            sm_scale=softmax_scale,
            k_scale=kv_scales.key_scale_cpu if can_scale else 1.0,
            v_scale=kv_scales.value_scale_cpu if can_scale else 1.0,
            window_left=window_size_left,
        )

    # If we are using flashdecoding or paged, we always use flash-attn for
    # the prefill. We have to branch on whether we use flash-attn v1 or v2.
    elif V2:
        out = torch.empty_like(query)
        if window_size_left <= 0 and window_size_left != -1:
            raise ValueError("`window_size_left` must be > 0 or -1")

        if softcap is None:
            softcap = 0.0

        return flash_attn_2_cuda.varlen_fwd(
            query,
            # flashdecoding: pass the KV caches, paged: pass the KV.
            kv_cache.key if ATTENTION == "flashdecoding" else key,
            kv_cache.value if ATTENTION == "flashdecoding" else value,
            out,
            seqlen.cu_seqlen_q,
            seqlen.cu_seqlen_k,
            None,
            None,
            block_tables if ATTENTION == "flashdecoding" else None,
            None,
            seqlen.max_q,
            seqlen.max_k,
            0.0,
            softmax_scale,
            False,
            causal,
            window_size_left,
            0,
            softcap,
            False,
            None,
        )[0]

    else:
        if window_size_left != -1:
            raise NotImplementedError(
                "window_size_left is only available with flash attn v2"
            )
        if softcap is not None:
            raise NotImplementedError("softcap is not available in flash attn v1")

        # Flash attention v1 requires q, k and v to have the same number of heads
        if key.shape[1] != query.shape[1]:
            # MQA expand
            if key.shape[1] == 1:
                key = key.expand(-1, query.shape[1], -1)
            # Grouped attention reshape
            else:
                original_shape = key.shape
                key = (
                    key.unsqueeze(2)
                    .expand(-1, -1, query.shape[1] // key.shape[1], -1)
                    .reshape(original_shape[0], -1, original_shape[2])
                )
        if value.shape[1] != query.shape[1]:
            # MQA expand
            if value.shape[1] == 1:
                value = value.expand(-1, query.shape[1], -1)
            # Grouped attention reshape
            else:
                original_shape = value.shape
                value = (
                    value.unsqueeze(2)
                    .expand(-1, -1, query.shape[1] // value.shape[1], -1)
                    .reshape(original_shape[0], -1, original_shape[2])
                )

        out = torch.empty_like(query)
        flash_attn_cuda.fwd(
            query,
            key,
            value,
            out,
            seqlen.cu_seqlen_q,
            seqlen.cu_seqlen_q,
            seqlen.max_q,
            seqlen.max_k,
            0.0,
            softmax_scale,
            False,
            causal,
            False,
            0,
            None,
        )
        return out


__all__ = [
    "SUPPORTS_WINDOWING",
    "attention",
    "paged_attention",
]


================================================
FILE: server/text_generation_server/layers/attention/flash_attn_triton.py
================================================
#!/usr/bin/env python
"""
Fused Attention
===============

This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao
(https://tridao.me/publications/flash2/flash2.pdf)
Credits: OpenAI kernel team, AMD ML Frameworks Triton team

Features supported:

1) Fwd with causal masking
2) Any sequence lengths without padding (currently fwd kernel only)
3) Support for different sequence lengths for q and k
4) Nested tensor API currently does not support dropout or bias.

Not currently supported:

1) Non power of two head dims

"""

import torch
import triton
import triton.language as tl

torch_dtype: tl.constexpr = torch.float16


@triton.jit
def cdiv_fn(x, y):
    return (x + y - 1) // y


@triton.jit
def max_fn(x, y):
    return tl.math.max(x, y)


@triton.jit
def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
    ms = tl.arange(0, m)
    ns = tl.arange(0, n)
    return philox_offset + ms[:, None] * stride + ns[None, :]


@triton.jit
def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
    rng_offsets = dropout_offsets(
        philox_seed, philox_offset, dropout_p, m, n, stride
    ).to(tl.uint32)
    # TODO: use tl.randint for better performance
    return tl.rand(philox_seed, rng_offsets)


@triton.jit
def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
    rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride)
    rng_keep = rng_output > dropout_p
    return rng_keep


@triton.jit
def load_fn(block_ptr, first, second, pad):
    if first and second:
        tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad)
    elif first:
        tensor = tl.load(block_ptr, boundary_check=(0,), padding_option=pad)
    elif second:
        tensor = tl.load(block_ptr, boundary_check=(1,), padding_option=pad)
    else:
        tensor = tl.load(block_ptr)
    return tensor


@triton.jit
def _attn_fwd_inner(
    acc,
    l_i,
    m_i,
    q,
    K_block_ptr,
    V_block_ptr,
    start_m,
    actual_seqlen_k,
    dropout_p,
    philox_seed,
    batch_philox_offset,
    encoded_softmax_block_ptr,
    block_min,
    block_max,
    offs_n_causal,
    masked_blocks,
    n_extra_tokens,
    bias_ptr,
    IS_CAUSAL: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
    OFFS_M: tl.constexpr,
    OFFS_N: tl.constexpr,
    PRE_LOAD_V: tl.constexpr,
    MASK_STEPS: tl.constexpr,
    ENABLE_DROPOUT: tl.constexpr,
    RETURN_ENCODED_SOFTMAX: tl.constexpr,
    PADDED_HEAD: tl.constexpr,
):
    # loop over k, v, and update accumulator
    for start_n in range(block_min, block_max, BLOCK_N):
        # For padded blocks, we will overrun the tensor size if
        # we load all BLOCK_N. For others, the blocks are all within range.
        k = load_fn(
            K_block_ptr,
            PADDED_HEAD,
            MASK_STEPS and (n_extra_tokens != 0),
            "zero",
        )
        if PRE_LOAD_V:
            v = load_fn(
                V_block_ptr,
                MASK_STEPS and (n_extra_tokens != 0),
                PADDED_HEAD,
                "zero",
            )
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
        # We start from end of seqlen_k so only the first iteration would need
        # to be checked for padding if it is not a multiple of block_n
        # TODO: This can be optimized to only be true for the padded block.
        if MASK_STEPS:  # noqa: SIM102
            # If this is the last block / iteration, we want to
            # mask if the sequence length is not a multiple of block size
            # a solution is to always do BLOCK_M // BLOCK_N + 1 steps
            # if not is_modulo_mn. last step might get wasted but that is okay.
            # check if this masking works for that case.
            if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0):
                boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32)
                size_n = start_n + OFFS_N[None, :]
                mask = size_n < boundary_m[:, None]
                qk = tl.where(mask, qk, float("-inf"))
        if IS_CAUSAL:
            causal_boundary = start_n + offs_n_causal
            causal_mask = OFFS_M[:, None] >= causal_boundary[None, :]
            qk = tl.where(causal_mask, qk, float("-inf"))
        # -- compute qk ----
        qk += tl.dot(q, k)
        if bias_ptr is not None:
            bias = load_fn(
                bias_ptr, False, MASK_STEPS and (n_extra_tokens != 0), "zero"
            )
            # While bias is added after multiplying qk with sm_scale, our
            # optimization to use 2^x instead of e^x results in an additional
            # scale factor of log2(e) which we must also multiply the bias with.
            qk += bias * 1.44269504089
        m_ij = tl.maximum(m_i, tl.max(qk, 1))
        qk = qk - m_ij[:, None]
        p = tl.math.exp2(qk)

        # CAVEAT: Must update l_ij before applying dropout
        l_ij = tl.sum(p, 1)
        if ENABLE_DROPOUT:
            philox_offset = (
                batch_philox_offset
                + start_m * BLOCK_M * actual_seqlen_k
                + start_n
                - BLOCK_N
            )
            keep = dropout_mask(
                philox_seed,
                philox_offset,
                dropout_p,
                BLOCK_M,
                BLOCK_N,
                actual_seqlen_k,
            )
            if RETURN_ENCODED_SOFTMAX:
                tl.store(
                    encoded_softmax_block_ptr,
                    tl.where(keep, p, -p).to(encoded_softmax_block_ptr.type.element_ty),
                )
            p = tl.where(keep, p, 0.0)
        elif RETURN_ENCODED_SOFTMAX:
            tl.store(
                encoded_softmax_block_ptr,
                p.to(encoded_softmax_block_ptr.type.element_ty),
            )
        # -- update output accumulator --
        alpha = tl.math.exp2(m_i - m_ij)
        acc = acc * alpha[:, None]
        if not PRE_LOAD_V:
            v = load_fn(
                V_block_ptr,
                MASK_STEPS and (n_extra_tokens != 0),
                PADDED_HEAD,
                "zero",
            )
        # -- update m_i and l_i
        l_i = l_i * alpha + l_ij
        # update m_i and l_i
        m_i = m_ij
        acc += tl.dot(p.to(V_block_ptr.type.element_ty), v)
        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
        if bias_ptr is not None:
            bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N))
        if RETURN_ENCODED_SOFTMAX:
            encoded_softmax_block_ptr = tl.advance(
                encoded_softmax_block_ptr, (0, BLOCK_N)
            )
    return acc, l_i, m_i


@triton.autotune(
    configs=[
        triton.Config(
            {
                "BLOCK_M": 256,
                "BLOCK_N": 64,
                "waves_per_eu": 2,
                "PRE_LOAD_V": False,
            },
            num_stages=1,
            num_warps=8,
        ),
        triton.Config(
            {
                "BLOCK_M": 128,
                "BLOCK_N": 128,
                "waves_per_eu": 2,
                "PRE_LOAD_V": False,
            },
            num_stages=1,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_M": 256,
                "BLOCK_N": 128,
                "waves_per_eu": 2,
                "PRE_LOAD_V": False,
            },
            num_stages=1,
            num_warps=8,
        ),
        triton.Config(
            {
                "BLOCK_M": 128,
                "BLOCK_N": 64,
                "waves_per_eu": 3,
                "PRE_LOAD_V": True,
            },
            num_stages=1,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_M": 128,
                "BLOCK_N": 64,
                "waves_per_eu": 3,
                "PRE_LOAD_V": False,
            },
            num_stages=1,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_M": 64,
                "BLOCK_N": 64,
                "waves_per_eu": 4,
                "PRE_LOAD_V": False,
            },
            num_stages=1,
            num_warps=8,
        ),
        triton.Config(
            {
                "BLOCK_M": 32,
                "BLOCK_N": 32,
                "waves_per_eu": 4,
                "PRE_LOAD_V": False,
            },
            num_stages=1,
            num_warps=8,
        ),
        # TODO: This config fails with head_size not pow2 with data mismatches.
        #    triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1,
        #                   'PRE_LOAD_V': False}, num_stages=1, num_warps=4),
        triton.Config(
            {
                "BLOCK_M": 16,
                "BLOCK_N": 16,
                "waves_per_eu": 1,
                "PRE_LOAD_V": False,
            },
            num_stages=1,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_M": 128,
                "BLOCK_N": 64,
                "waves_per_eu": 1,
                "PRE_LOAD_V": False,
            },
            num_stages=1,
            num_warps=4,
        ),
    ],
    key=["IS_CAUSAL", "dropout_p", "BLOCK_DMODEL"],
)
@triton.jit
def attn_fwd(
    Q,
    K,
    V,
    bias,
    sm_scale,
    L,
    Out,
    stride_qz,
    stride_qh,
    stride_qm,
    stride_qk,
    stride_kz,
    stride_kh,
    stride_kn,
    stride_kk,
    stride_vz,
    stride_vh,
    stride_vk,
    stride_vn,
    stride_oz,
    stride_oh,
    stride_om,
    stride_on,
    stride_bz,
    stride_bh,
    stride_bm,
    stride_bn,
    cu_seqlens_q,
    cu_seqlens_k,
    dropout_p,
    philox_seed,
    philox_offset_base,
    encoded_softmax,
    HQ: tl.constexpr,
    HK: tl.constexpr,
    ACTUAL_BLOCK_DMODEL: tl.constexpr,
    MAX_SEQLENS_Q: tl.constexpr,
    MAX_SEQLENS_K: tl.constexpr,
    VARLEN: tl.constexpr,
    IS_CAUSAL: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_DMODEL: tl.constexpr,
    BLOCK_N: tl.constexpr,
    PRE_LOAD_V: tl.constexpr,
    BIAS_TYPE: tl.constexpr,
    ENABLE_DROPOUT: tl.constexpr,
    RETURN_ENCODED_SOFTMAX: tl.constexpr,
):
    start_m = tl.program_id(0)
    off_h_q = tl.program_id(1)
    off_z = tl.program_id(2)
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    if VARLEN:
        cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
        cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
        seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start
        # We have a one-size-fits-all grid in id(0). Some seqlens might be too
        # small for all start_m so for those we return early.
        if start_m * BLOCK_M > seqlen_q:
            return
        cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
        cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
        seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start
    else:
        cu_seqlens_q_start = 0
        cu_seqlens_k_start = 0
        seqlen_q = MAX_SEQLENS_Q
        seqlen_k = MAX_SEQLENS_K

    # Now we compute whether we need to exit early due to causal masking.
    # This is because for seqlen_q > seqlen_k, M rows of the attn scores
    # are completely masked, resulting in 0s written to the output, and
    # inf written to LSE. We don't need to do any GEMMs in this case.
    # This block of code determines what N is, and if this WG is operating
    # on those M rows.
    n_blocks = cdiv_fn(seqlen_k, BLOCK_N)
    if IS_CAUSAL:
        # If seqlen_q == seqlen_k, the attn scores are a square matrix.
        # If seqlen_q != seqlen_k, attn scores are rectangular which means
        # the causal mask boundary is bottom right aligned, and ends at either
        # the top edge (seqlen_q < seqlen_k) or left edge.
        # This captures the decrease in n_blocks if we have a rectangular attn
        # matrix
        n_blocks_seqlen = cdiv_fn(
            (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N
        )
        # This is what adjusts the block_max for the current WG, only
        # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks
        n_blocks = min(n_blocks, n_blocks_seqlen)
        # If we have no blocks after adjusting for seqlen deltas, this WG is
        # part of the blocks that are all 0. We exit early.
        if n_blocks <= 0:
            o_offset = (
                off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh
            )
            O_block_ptr = tl.make_block_ptr(
                base=Out + o_offset,
                shape=(seqlen_q, BLOCK_DMODEL),
                strides=(stride_om, stride_on),
                offsets=(start_m * BLOCK_M, 0),
                block_shape=(BLOCK_M, BLOCK_DMODEL),
                order=(1, 0),
            )
            acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty)
            # We still need to write 0s to the result
            # tl.store(O_block_ptr,
            # acc.to(Out.type.element_ty), boundary_check=(0,1))
            # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q
            #          + offs_m
            # We store inf to LSE, not -inf because in the bwd pass,
            # we subtract this
            # from qk which makes it -inf, such that exp(qk - inf) = 0
            # for these masked blocks.
            # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32)
            # tl.store(l_ptrs, l)
            # TODO: Should dropout and return encoded softmax be handled here?
            return

    # If MQA / GQA, set the K and V head offsets appropriately.
    GROUP_SIZE: tl.constexpr = HQ // HK
    if GROUP_SIZE != 1:
        off_h_k = off_h_q // GROUP_SIZE
    else:
        off_h_k = off_h_q

    n_extra_tokens = 0
    if seqlen_k < BLOCK_N:
        n_extra_tokens = BLOCK_N - seqlen_k
    elif seqlen_k % BLOCK_N:
        n_extra_tokens = seqlen_k % BLOCK_N
    PADDED_HEAD: tl.constexpr = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL

    # Compute pointers for all the tensors used in this kernel.
    q_offset = off_z * stride_qz + off_h_q * stride_qh + cu_seqlens_q_start * stride_qm
    Q_block_ptr = tl.make_block_ptr(
        base=Q + q_offset,
        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
        strides=(stride_qm, stride_qk),
        offsets=(start_m * BLOCK_M, 0),
        block_shape=(BLOCK_M, BLOCK_DMODEL),
        order=(1, 0),
    )
    k_offset = off_z * stride_kz + off_h_k * stride_kh + cu_seqlens_k_start * stride_kn
    K_block_ptr = tl.make_block_ptr(
        base=K + k_offset,
        shape=(ACTUAL_BLOCK_DMODEL, seqlen_k),
        strides=(stride_kk, stride_kn),
        offsets=(0, 0),
        block_shape=(BLOCK_DMODEL, BLOCK_N),
        order=(0, 1),
    )
    v_offset = off_z * stride_vz + off_h_k * stride_vh + cu_seqlens_k_start * stride_vk
    V_block_ptr = tl.make_block_ptr(
        base=V + v_offset,
        shape=(seqlen_k, ACTUAL_BLOCK_DMODEL),
        strides=(stride_vk, stride_vn),
        offsets=(0, 0),
        block_shape=(BLOCK_N, BLOCK_DMODEL),
        order=(1, 0),
    )
    if BIAS_TYPE != 0:
        bias_ptr = tl.make_block_ptr(
            base=bias + off_h_q * stride_bh,
            shape=(seqlen_q, seqlen_k),
            strides=(stride_bm, stride_bn),
            offsets=(start_m * BLOCK_M, 0),
            block_shape=(BLOCK_M, BLOCK_N),
            order=(1, 0),
        )
    else:
        bias_ptr = None
    if ENABLE_DROPOUT:
        batch_philox_offset = (
            philox_offset_base + (off_z * HQ + off_h_q) * seqlen_q * seqlen_k
        )
    else:
        batch_philox_offset = 0
    # We can ask to return the dropout mask without actually doing any dropout.
    # In this case, we return an invalid pointer so indicate the mask is not i
    # valid.
    # TODO: Fix encoded softmax. It currently uses just h_q in the base offset.
    if RETURN_ENCODED_SOFTMAX:
        encoded_softmax_block_ptr = tl.make_block_ptr(
            base=encoded_softmax + off_h_q * seqlen_q * seqlen_k,
            shape=(seqlen_q, seqlen_k),
            strides=(seqlen_k, 1),
            offsets=(start_m * BLOCK_M, 0),
            block_shape=(BLOCK_M, BLOCK_N),
            order=(1, 0),
        )
    else:
        encoded_softmax_block_ptr = 0
    # initialize pointer to m and l
    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
    l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
    # scale sm_scale by log_2(e) and use 2^x in the loop as we do not
    # have native e^x support in HW.
    qk_scale = sm_scale * 1.44269504089
    # Q is loaded once at the beginning and shared by all N blocks.
    q = load_fn(Q_block_ptr, True, PADDED_HEAD, "zero")
    q = (q * qk_scale).to(Q_block_ptr.type.element_ty)

    # Here we compute how many full and masked blocks we have.
    padded_block_k = n_extra_tokens != 0
    is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0)
    if IS_CAUSAL:
        # There are always at least BLOCK_M // BLOCK_N masked blocks.
        # Additionally there might be one more due to dissimilar seqlens.
        masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn)
    else:
        # Padding on Q does not need to be masked in the FA loop.
        masked_blocks = padded_block_k
    # if IS_CAUSAL, not is_modulo_mn does not always result in an additional
    # block. In this case we might exceed n_blocks so pick the min.
    masked_blocks = min(masked_blocks, n_blocks)
    n_full_blocks = n_blocks - masked_blocks
    block_min = 0
    block_max = n_blocks * BLOCK_N
    # Compute for full blocks. Here we set causal to false regardless of its
    # value because there is no masking. Similarly we do not need padding.
    if n_full_blocks > 0:
        block_max = (n_blocks - masked_blocks) * BLOCK_N
        acc, l_i, m_i = _attn_fwd_inner(
            acc,
            l_i,
            m_i,
            q,
            K_block_ptr,
            V_block_ptr,
            start_m,
            seqlen_k,
            dropout_p,
            philox_seed,
            batch_philox_offset,
            encoded_softmax_block_ptr,
            # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _
            block_min,
            block_max,
            0,
            0,
            0,
            bias_ptr,
            # IS_CAUSAL, ....
            False,
            BLOCK_M,
            BLOCK_DMODEL,
            BLOCK_N,
            offs_m,
            offs_n,
            # _, MASK_STEPS, ...
            PRE_LOAD_V,
            False,
            ENABLE_DROPOUT,
            RETURN_ENCODED_SOFTMAX,
            PADDED_HEAD,
        )
        block_min = block_max
        block_max = n_blocks * BLOCK_N

    tl.debug_barrier()
    # Remaining blocks, if any, are full / not masked.
    if masked_blocks > 0:
        offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0
        K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N))
        V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0))
        if bias_ptr is not None:
            bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N))
        if RETURN_ENCODED_SOFTMAX:
            encoded_softmax_block_ptr = tl.advance(
                encoded_softmax_block_ptr, (0, n_full_blocks)
            )
        acc, l_i, m_i = _attn_fwd_inner(
            acc,
            l_i,
            m_i,
            q,
            K_block_ptr,
            V_block_ptr,
            start_m,
            seqlen_k,
            dropout_p,
            philox_seed,
            batch_philox_offset,
            encoded_softmax_block_ptr,
            block_min,
            block_max,
            offs_n_causal,
            masked_blocks,
            n_extra_tokens,
            bias_ptr,
            IS_CAUSAL,
            BLOCK_M,
            BLOCK_DMODEL,
            BLOCK_N,
            offs_m,
            offs_n,
            # _, MASK_STEPS, ...
            PRE_LOAD_V,
            True,
            ENABLE_DROPOUT,
            RETURN_ENCODED_SOFTMAX,
            PADDED_HEAD,
        )
    # epilogue
    acc = acc / l_i[:, None]
    if ENABLE_DROPOUT:
        acc = acc / (1 - dropout_p)
    # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M,
    # then we have one block with a row of all NaNs which come from computing
    # softmax over a row of all -infs (-inf - inf = NaN). We check for that here
    # and store 0s where there are NaNs as these rows should've been zeroed out.
    end_m_idx = (start_m + 1) * BLOCK_M
    start_m_idx = start_m * BLOCK_M
    causal_start_idx = seqlen_q - seqlen_k
    acc = acc.to(Out.type.element_ty)
    if IS_CAUSAL:  # noqa: SIM102
        if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx:
            out_mask_boundary = tl.full(
                (BLOCK_DMODEL,), causal_start_idx, dtype=tl.int32
            )
            mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
            out_ptrs_mask = mask_m_offsets[:, None] >= out_mask_boundary[None, :]
            z = 0.0
            acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty))
    # write back LSE
    # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m
    # If seqlen_q not multiple of BLOCK_M, we need to mask out the last
    # few rows. This is only true for the last M block. For others,
    # overflow_size will be -ve
    # overflow_size = end_m_idx - seqlen_q
    # if overflow_size > 0:
    #    boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32)
    #    # This is a > check because mask being 0 blocks the store.
    #    l_ptrs_mask = boundary > tl.arange(0, BLOCK_M)
    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask)
    # else:
    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i))

    # write back O
    o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh
    O_block_ptr = tl.make_block_ptr(
        base=Out + o_offset,
        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
        strides=(stride_om, stride_on),
        offsets=(start_m * BLOCK_M, 0),
        block_shape=(BLOCK_M, BLOCK_DMODEL),
        order=(1, 0),
    )
    # Need boundary check on this to make sure the padding from the
    # Q and KV tensors in both dims are not part of what we store back.
    # TODO: Do the boundary check optionally.
    tl.store(O_block_ptr, acc, boundary_check=(0, 1))


def check_args(
    q,
    k,
    v,
    o,
    varlen=True,
    max_seqlens=None,
    cu_seqlens_q=None,
    cu_seqlens_k=None,
):
    assert q.dim() == k.dim() and q.dim() == v.dim()
    if varlen:
        assert q.dim() == 3
        total_q, nheads_q, head_size = q.shape
        total_k, nheads_k, _ = k.shape
        assert cu_seqlens_q is not None
        assert cu_seqlens_k is not None
        assert len(cu_seqlens_q) == len(cu_seqlens_k)
    else:
        assert q.dim() == 4
        batch, nheads_q, seqlen_q, head_size = q.shape
        _, nheads_k, seqlen_k, _ = k.shape
        assert max_seqlens > 0
    assert k.shape == v.shape
    assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
    # TODO: Change assert if we support qkl f8 and v f16
    assert q.dtype == k.dtype and q.dtype == v.dtype
    # TODO: Fix assert to check head size <=256 once supported
    assert head_size <= 128
    assert o.shape == q.shape
    assert (nheads_q % nheads_k) == 0


class _attention(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        q,
        k,
        v,
        o,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlens_q,
        max_seqlens_k,
        causal=False,
        sm_scale=1.0,
        bias=None,
    ):
        if o is None:
            o = torch.empty_like(q, dtype=v.dtype)

        check_args(
            q,
            k,
            v,
            o,
            varlen=True,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
        )
        if True:  # varlen
            total_q, nheads_q, head_size = q.shape
            total_k, nheads_k, _ = k.shape
            batch = len(cu_seqlens_q) - 1
            q_strides = (0, q.stride(1), q.stride(0), q.stride(2))
            k_strides = (0, k.stride(1), k.stride(0), k.stride(2))
            v_strides = (0, v.stride(1), v.stride(0), v.stride(2))
            o_strides = (0, o.stride(1), o.stride(0), o.stride(2))
        else:
            batch, seqlen_q, nheads_q, head_size = q.shape
            _, seqlen_k, nheads_k, _ = k.shape
            q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3))
            k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3))
            v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3))
            o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3))

        # Get closest power of 2 over or equal to 32.
        padded_d_model = 1 << (head_size - 1).bit_length()
        padded_d_model = max(padded_d_model, 16)

        def grid(META):
            return triton.cdiv(max_seqlens_q, META["BLOCK_M"]), nheads_q, batch

        encoded_softmax = None

        # Seed the RNG so we get reproducible results for testing.
        philox_seed = 0x1BF52
        philox_offset = 0x1D4B42

        if bias is not None:
            bias_strides = (
                bias.stride(0),
                bias.stride(1),
                bias.stride(2),
                bias.stride(3),
            )
        else:
            bias_strides = (0, 0, 0, 0)

        attn_fwd[grid](
            q,
            k,
            v,
            bias,
            sm_scale,
            None,
            o,
            *q_strides,
            *k_strides,
            *v_strides,
            *o_strides,
            *bias_strides,
            cu_seqlens_q,
            cu_seqlens_k,
            dropout_p=0.0,
            philox_seed=philox_seed,
            philox_offset_base=philox_offset,
            encoded_softmax=encoded_softmax,
            HQ=nheads_q,
            HK=nheads_k,
            ACTUAL_BLOCK_DMODEL=head_size,
            MAX_SEQLENS_Q=max_seqlens_q,
            MAX_SEQLENS_K=max_seqlens_k,
            IS_CAUSAL=causal,
            VARLEN=True,
            BLOCK_DMODEL=padded_d_model,
            BIAS_TYPE=0 if bias is None else 1,
            ENABLE_DROPOUT=False,
            RETURN_ENCODED_SOFTMAX=False,
        )

        ctx.grid = grid
        ctx.sm_scale = sm_scale
        ctx.BLOCK_DMODEL = head_size
        ctx.causal = causal
        ctx.dropout_p = 0.0
        ctx.philox_seed = philox_seed
        ctx.philox_offset = philox_offset
        ctx.encoded_softmax = encoded_softmax
        ctx.return_encoded_softmax = False
        return o, encoded_softmax


triton_attention = _attention.apply


================================================
FILE: server/text_generation_server/layers/attention/flashinfer.py
================================================
from typing import Optional
from contextvars import ContextVar
from contextlib import contextmanager
import math

import flashinfer
import torch

prefill_state: ContextVar[flashinfer.BatchPrefillWithRaggedKVCacheWrapper] = ContextVar(
    "prefill_state"
)

prefill_with_paged_kv_state: ContextVar[
    flashinfer.BatchPrefillWithPagedKVCacheWrapper
] = ContextVar("prefill_with_paged_kv_state")

decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = ContextVar(
    "decode_state"
)

workspace: Optional[torch.Tensor] = None


def unpad_2d_mask(
    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
) -> torch.Tensor:
    # Like torch unpad_sequence, but for 2D masks.
    unpadded_tensors = []
    for i, length in enumerate(seq_lengths):
        unpadded_matrix = attention_mask[i, :length, :length]
        unpadded_tensors.append(unpadded_matrix.flatten())

    packed_tensor = torch.cat(unpadded_tensors)

    return packed_tensor


def get_workspace(device):
    """Get shared flashinfer workspace."""
    global workspace
    if workspace is None:
        workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device=device)
    return workspace


def create_prefill_with_paged_kv_state(
    *,
    device: torch.device,
):
    """Create a prefill state that uses the KV cache."""
    workspace_buffer = get_workspace(device)
    return flashinfer.BatchPrefillWithPagedKVCacheWrapper(
        workspace_buffer, kv_layout="NHD", use_cuda_graph=False
    )


@contextmanager
def use_prefill_with_paged_kv_state(
    *,
    state: flashinfer.BatchPrefillWithPagedKVCacheWrapper,
    block_tables: torch.Tensor,
    cu_seqlens: torch.Tensor,
    custom_mask: Optional[torch.Tensor],
    input_lengths: torch.Tensor,
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    page_size: int,
    kv_dtype: torch.dtype,
    q_dtype: torch.dtype,
):
    """
    Context manager to set the active flashinfer prefill state to the given
    `state` and parameters. This state will be used by all calls to the
    `attention` function while the context manager is active.
    """

    indptr = torch.zeros(
        input_lengths.shape[0] + 1, device=input_lengths.device, dtype=torch.int32
    )
    # Round up to page size and then calculate the cumulative sum to get
    # the indices into the block table.
    torch.add(input_lengths, page_size - 1, out=indptr[1:])
    indptr[1:].div_(page_size, rounding_mode="floor")
    indptr[1:].cumsum_(-1)

    # Get the lengths of the last page in a block.
    if page_size == 1:
        last_page_len = torch.ones(
            input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device
        )
    else:
        last_page_len = torch.empty(
            input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device
        )
        torch.sub(input_lengths, 1, out=last_page_len)
        last_page_len.remainder_(page_size)
        last_page_len += 1

    token = prefill_with_paged_kv_state.set(state)

    # Attention masks are padded, unpad.
    if custom_mask is not None:
        bs = input_lengths.shape[0]
        seq_len = math.isqrt(custom_mask.numel() // bs)
        custom_mask = unpad_2d_mask(
            custom_mask.reshape(bs, seq_len, seq_len), input_lengths
        )

    try:
        state.plan(
            qo_indptr=cu_seqlens,
            paged_kv_indptr=indptr,
            paged_kv_indices=block_tables,
            paged_kv_last_page_len=last_page_len,
            custom_mask=custom_mask,
            num_qo_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_size,
            kv_data_type=kv_dtype,
            q_data_type=q_dtype,
            page_size=page_size,
        )
        yield
    finally:
        if token is not None:
            prefill_with_paged_kv_state.reset(token)


def create_prefill_state(
    *,
    device: torch.device,
):
    """Create a prefill state."""
    workspace_buffer = get_workspace(device)
    return flashinfer.BatchPrefillWithRaggedKVCacheWrapper(
        workspace_buffer, kv_layout="NHD", use_cuda_graph=False
    )


def create_decode_state(
    *,
    device: torch.device,
    num_heads: int,
    num_kv_heads: int,
):
    """Create a decode state."""
    workspace_buffer = get_workspace(device)
    num_groups = num_heads // num_kv_heads
    return flashinfer.BatchDecodeWithPagedKVCacheWrapper(
        workspace_buffer,
        kv_layout="NHD",
        use_cuda_graph=False,
        # Taken from https://github.com/flashinfer-ai/flashinfer/blob/33ef95700981ba70f4cab63b8931e562bc795b21/python/flashinfer/decode.py#L57-L60
        use_tensor_cores=num_groups not in [1, 2, 4, 8],
    )


def create_decode_state_cuda_graphs(
    *,
    device: torch.device,
    block_tables: torch.Tensor,
    block_tables_ptr: torch.Tensor,
    last_page_len: torch.Tensor,
    num_heads: int,
    num_kv_heads: int,
):
    """
    Create a decode state for use with CUDA Graphs. `block_tables`,
    `block_tables_ptr`, and `last_page_len` are used in CUDA Graphs and are
    therefore stored as part of the state.
    """
    workspace_buffer = get_workspace(device)
    num_groups = num_heads // num_kv_heads
    return flashinfer.BatchDecodeWithPagedKVCacheWrapper(
        workspace_buffer,
        kv_layout="NHD",
        use_cuda_graph=True,
        paged_kv_indices_buffer=block_tables,
        paged_kv_indptr_buffer=block_tables_ptr,
        paged_kv_last_page_len_buffer=last_page_len,
        # Taken from https://github.com/flashinfer-ai/flashinfer/blob/33ef95700981ba70f4cab63b8931e562bc795b21/python/flashinfer/decode.py#L57-L60
        use_tensor_cores=num_groups not in [1, 2, 4, 8],
    )


@contextmanager
def use_decode_state(
    *,
    state: flashinfer.BatchDecodeWithPagedKVCacheWrapper,
    input_lengths: torch.Tensor,
    block_tables: torch.Tensor,
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    page_size: int,
    kv_cache_dtype: torch.dtype,
    q_dtype: torch.dtype,
):
    """
    Context manager to set the active flashinfer decoding state to the given
    `state` and parameters. This state will be used by all calls to the
    `paged_attention` function while the context manager is active.
    """
    indptr = torch.zeros(
        input_lengths.shape[0] + 1, device=input_lengths.device, dtype=torch.int32
    )
    # Round up to page size and then calculate the cumulative sum to get
    # the indices into the block table.
    torch.add(input_lengths, page_size - 1, out=indptr[1:])
    indptr[1:].div_(page_size, rounding_mode="floor")
    indptr[1:].cumsum_(-1)

    # Get the lengths of the last page in a block.
    last_page_len = torch.empty(
        input_lengths.shape[0], dtype=torch.int32, device=input_lengths.device
    )
    torch.sub(input_lengths, 1, out=last_page_len)
    last_page_len.remainder_(page_size)
    last_page_len += 1

    token = decode_state.set(state)

    try:
        state.plan(
            indptr=indptr,
            indices=block_tables,
            last_page_len=last_page_len,
            num_qo_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_size,
            page_size=page_size,
            data_type=kv_cache_dtype,
            q_data_type=q_dtype,
        )
        yield
    finally:
        if token is not None:
            decode_state.reset(token)


================================================
FILE: server/text_generation_server/layers/attention/ipex.py
================================================
import intel_extension_for_pytorch as ipex
import torch
from text_generation_server.layers.attention.kv_cache import KVCache, KVScales
from text_generation_server.layers.attention import Seqlen
from typing import Optional
from text_generation_server.models.globals import (
    ATTENTION,
    BLOCK_SIZE,
)

if ATTENTION == "flashdecoding-ipex":
    SUPPORTS_WINDOWING = True
else:
    SUPPORTS_WINDOWING = False


def attention(
    *,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: KVCache,
    kv_scales: KVScales,
    seqlen: Seqlen,
    block_tables: torch.Tensor,
    softmax_scale: float,
    window_size_left: int = -1,
    causal: bool = True,
    softcap: Optional[float] = None,
):

    out = torch.empty_like(query)
    kv_cache_dtype = "auto"
    if kv_cache.key.dtype == torch.float8_e5m2:
        kv_cache_dtype = "fp8_e5m2"
    if kv_cache.key.dtype == torch.float8_e4m3fn:
        kv_cache_dtype = "fp8_e4m3"

    # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load.
    if ATTENTION == "flashdecoding-ipex":
        window_size_right = -1 if window_size_left == -1 else 0
        if softcap is None:
            softcap = -1.0
        ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
            out,
            query.contiguous() if query.device.type == "xpu" else query,
            kv_cache.key,
            kv_cache.value,
            seqlen.cu_seqlen_q,
            seqlen.cu_seqlen_k,
            seqlen.max_q,
            seqlen.max_k,
            softmax_scale,
            causal,
            block_tables,
            None,
            window_size_left=window_size_left,
            window_size_right=window_size_right,
            kv_cache_dtype=kv_cache_dtype,
            k_scale=kv_scales.key_scale_cpu,
            v_scale=kv_scales.value_scale_cpu,
            softcap=softcap,
        )
    else:
        if softcap is not None:
            raise NotImplementedError(
                "softcap is not available in IPEX paged attention"
            )
        ipex.llm.functional.varlen_attention(
            query.contiguous() if query.device.type == "xpu" else query,
            key.contiguous() if key.device.type == "xpu" else key,
            value.contiguous() if value.device.type == "xpu" else value,
            out,
            seqlen.cu_seqlen_q,
            seqlen.cu_seqlen_q,
            seqlen.max_q,
            seqlen.max_q,
            0.0,
            softmax_scale,
            False,
            causal,
            False,
            None,
        )

    return out


def paged_attention(
    query: torch.Tensor,
    kv_cache: KVCache,
    kv_head_mapping: torch.Tensor,
    softmax_scale: float,
    block_tables: torch.Tensor,
    seqlen: Seqlen,
    max_s: int,
    *,
    kv_scales: KVScales,
    softcap: Optional[float] = None,
    window_size_left: Optional[int] = -1,
):
    out = torch.empty_like(query)
    kv_cache_dtype = "auto"
    if kv_cache.key.dtype == torch.float8_e5m2:
        kv_cache_dtype = "fp8_e5m2"
    if kv_cache.key.dtype == torch.float8_e4m3fn:
        kv_cache_dtype = "fp8_e4m3"
    if ATTENTION == "flashdecoding-ipex":
        window_size_right = -1 if window_size_left == -1 else 0
        if softcap is None:
            softcap = -1.0
        ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
            out,
            query.contiguous() if query.device.type == "xpu" else query,
            kv_cache.key,
            kv_cache.value,
            seqlen.cu_seqlen_q,
            seqlen.cu_seqlen_k,
            seqlen.max_q,
            seqlen.max_k,
            softmax_scale,
            True,
            block_tables,
            None,
            window_size_left=window_size_left,
            window_size_right=window_size_right,
            kv_cache_dtype=kv_cache_dtype,
            k_scale=kv_scales.key_scale_cpu,
            v_scale=kv_scales.value_scale_cpu,
            softcap=softcap,
        )
    else:
        input_lengths = seqlen.input_lengths + seqlen.cache_lengths
        if softcap is not None:
            raise NotImplementedError(
                "softcap is not available in IPEX paged attention"
            )
        ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
            out,
            query,
            kv_cache.key,
            kv_cache.value,
            kv_head_mapping,
            softmax_scale,
            block_tables,
            input_lengths,
            BLOCK_SIZE,
            max_s,
            None,
            k_scale=kv_scales.key_scale_cpu,
            v_scale=kv_scales.value_scale_cpu,
        )
    return out


__all__ = [
    "SUPPORTS_WINDOWING",
    "attention",
    "paged_attention",
]


================================================
FILE: server/text_generation_server/layers/attention/kv_cache.py
================================================
from typing import Tuple
from dataclasses import dataclass, field

from loguru import logger
import torch

from text_generation_server.layers.fp8 import fp8_quantize
from text_generation_server.models.globals import ATTENTION, BLOCK_SIZE
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weights

if SYSTEM == "cuda":
    try:
        paged_attention = load_kernel(
            module="paged_attention", repo_id="kernels-community/paged-attention"
        )
    except Exception as e:
        raise ImportError(
            f"Could not import attention kernels. Make sure your installation is correct. Complete error: {e}"
        )
else:
    paged_attention = None


@dataclass
class KVScales:
    """
    Key-value scales for FP8 KV cache.

    This data class stores key and value scales both as a GPU tensor and
    as a GPU float. This inconvenience is necessary because some functions
    (e.g. scaling kernels) take scales as a GPU tensor, whereas others
    (e.g. flashinfer) take scales as a CPU scalar.
    """

    key_scale: torch.Tensor
    value_scale: torch.Tensor
    key_scale_cpu: float = field(init=False)
    value_scale_cpu: float = field(init=False)

    def __post_init__(self):
        if self.key_scale.numel() != 1 or self.value_scale.numel() != 1:
            raise ValueError("Key and value scales must be scalar tensors.")

        self.key_scale_cpu = self.key_scale.item()
        self.value_scale_cpu = self.value_scale.item()


class KVCache:
    """
    Key-value cache for attention layers.
    """

    kv_cache: Tuple[torch.Tensor, torch.Tensor]

    def __init__(
        self,
        *,
        num_blocks: int,
        num_heads: int,
        head_size: int,
        dtype: torch.dtype,
        device: torch.device,
    ):
        """Construct the key-value cache for a layer."""
        if dtype in {torch.float8_e5m2, torch.float8_e4m3fn}:
            if not (
                (ATTENTION == "flashinfer" and SYSTEM == "cuda")
                or (ATTENTION == "paged" and SYSTEM in ("cuda", "rocm", "ipex"))
                or (ATTENTION == "flashdecoding-ipex")
            ):
                raise ValueError(
                    "FP8 KV cache is currently only supported for flashinfer on CUDA and paged attention on CUDA, ROCm and INTEL IPEX and flashdecoding in Intel IPEX "
                )
            if SYSTEM == "rocm" and dtype == torch.float8_e5m2:
                raise ValueError(
                    "float8_e5m2 FP8 KV cache is not supported on AMD ROCm"
                )
            if device.type == "cpu" and dtype == torch.float8_e4m3fn:
                raise ValueError(
                    "float8_e4m3fn FP8 KV cache is not supported on Intel IPEX CPU"
                )

        element_size = torch.tensor([], dtype=dtype).element_size()
        if SYSTEM == "ipex" and device.type == "xpu":
            x = 1
        else:
            x = BLOCK_SIZE // element_size

        if ATTENTION in {"flashdecoding", "flashinfer"} or (
            ATTENTION == "flashdecoding-ipex" and device.type == "xpu"
        ):
            self.kv_cache = (
                torch.empty(
                    (num_blocks, BLOCK_SIZE, num_heads, head_size),
                    dtype=dtype,
                    device=device,
                ),
                torch.empty(
                    (num_blocks, BLOCK_SIZE, num_heads, head_size),
                    dtype=dtype,
                    device=device,
                ),
            )
        elif SYSTEM == "ipex" and device == torch.device("cpu"):
            # ipex cpu flashdecoding kernel and paged attention kernel share same layout
            self.kv_cache = (
                torch.empty(
                    (num_blocks, num_heads, BLOCK_SIZE, head_size),
                    dtype=dtype,
                    device=device,
                ),
                torch.empty(
                    (num_blocks, num_heads, BLOCK_SIZE, head_size),
                    dtype=dtype,
                    device=device,
                ),
            )
        else:
            self.kv_cache = (
                torch.zeros(
                    (num_blocks, num_heads, head_size // x, BLOCK_SIZE, x),
                    dtype=dtype,
                    device=device,
                ),
                torch.zeros(
                    (num_blocks, num_heads, head_size, BLOCK_SIZE),
                    dtype=dtype,
                    device=device,
                ),
            )

    def can_scale(self, kv_scales: KVScales) -> bool:
        """Check if the cache can be scaled by the given scales."""
        if kv_scales.key_scale_cpu == 1.0 and kv_scales.value_scale_cpu == 1.0:
            return False
        elif self.dtype == torch.float8_e4m3fn and (
            (ATTENTION in ("paged", "flashinfer") and SYSTEM == "cuda")
            or (ATTENTION == "paged" and SYSTEM in ["rocm", "ipex"])
            or (ATTENTION == "flashdecoding-ipex")
        ):
            log_once(logger.info, "Using FP8 KV cache scales")
            return True
        else:
            # We have scales, but not the correct FP8 cache type, so warn once.
            log_once(
                logger.info,
                "Ignoring FP8 KV cache scales, supported only for float8_e4m3fn KV cache with flashinfer on CUDA and paged attention on ROCm/IPEX and flashdecoding on IPEX",
            )
            return False

    @property
    def dtype(self):
        """Get the data type of the cache."""
        return self.kv_cache[0].dtype

    @property
    def key(self):
        """Get the key cache."""

        return self.kv_cache[0]

    @property
    def value(self):
        """Get the value cache."""

        return self.kv_cache[1]

    def store(
        self,
        *,
        key: torch.Tensor,
        value: torch.Tensor,
        slots: torch.Tensor,
        kv_scales: KVScales,
    ):
        """Store the key and value at the given slots."""

        key_cache = self.kv_cache[0]
        value_cache = self.kv_cache[1]

        if self.can_scale(kv_scales) and SYSTEM == "cuda":
            if kv_scales.key_scale_cpu != 1.0:
                key = fp8_quantize(
                    key.float(),
                    scale=kv_scales.key_scale,
                    qdtype=self.dtype,
                    scalar=True,
                )[0]
            if kv_scales.value_scale_cpu != 1.0:
                value = fp8_quantize(
                    value.float(),
                    scale=kv_scales.value_scale,
                    qdtype=self.dtype,
                    scalar=True,
                )[0]

        if ATTENTION in {"flashdecoding", "flashinfer"}:
            key = key.to(key_cache.dtype)
            value = value.to(value_cache.dtype)
            if key_cache.dtype in {torch.float8_e4m3fn, torch.float8_e5m2}:
                # Torch index_put does not support float8_{e5m2,e4m3fn} yet, so
                # put as raw data instead.
                key_cache = key_cache.view(torch.uint8)
                value_cache = value_cache.view(torch.uint8)
                key = key.view(torch.uint8)
                value = value.view(torch.uint8)
            shape = key_cache.shape
            key_cache.view(-1, shape[-2], shape[-1])[slots] = key
            value_cache.view(-1, shape[-2], shape[-1])[slots] = value
        elif ATTENTION == "flashdecoding-ipex" and key.device.type == "xpu":
            import intel_extension_for_pytorch as ipex

            kv_cache_dtype = "auto"
            if key_cache.dtype == torch.float8_e5m2:
                kv_cache_dtype = "fp8_e5m2"
            if key_cache.dtype == torch.float8_e4m3fn:
                kv_cache_dtype = "fp8_e4m3"
            ipex.llm.modules.PagedAttention.reshape_and_cache_flash(
                key,
                value,
                key_cache,
                value_cache,
                slots,
                kv_cache_dtype=kv_cache_dtype,
                k_scale=kv_scales.key_scale_cpu,
                v_scale=kv_scales.value_scale_cpu,
            )
        else:
            paged_reshape_and_cache(
                key,
                value,
                key_cache,
                value_cache,
                slots,
                kv_scales.key_scale_cpu,
                kv_scales.value_scale_cpu,
            )


def paged_reshape_and_cache(
    key: torch.Tensor,
    value: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    slots: torch.Tensor,
    k_scale: float = 1.0,
    v_scale: float = 1.0,
):

    if SYSTEM == "cuda":
        kv_cache_dtype = "auto"
        if key_cache.dtype == torch.float8_e4m3fn:
            kv_cache_dtype = "fp8"

        paged_attention.reshape_and_cache(
            key,
            value,
            key_cache,
            value_cache,
            slots,
            kv_cache_dtype,
            torch.tensor(k_scale),
            torch.tensor(v_scale),
        )
    elif SYSTEM == "rocm":
        try:
            import vllm._custom_ops as ops
        except Exception as e:
            raise ImportError(
                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
            )

        kv_cache_dtype = "auto"
        if key_cache.dtype == torch.float8_e4m3fn:
            key_cache = key_cache.view(torch.uint8)
            value_cache = value_cache.view(torch.uint8)
            kv_cache_dtype = "fp8"

        ops.reshape_and_cache(
            key, value, key_cache, value_cache, slots, kv_cache_dtype, k_scale, v_scale
        )
    elif SYSTEM == "ipex":
        import intel_extension_for_pytorch as ipex

        kv_cache_dtype = "auto"
        if key_cache.dtype == torch.float8_e5m2:
            kv_cache_dtype = "fp8_e5m2"
        if key_cache.dtype == torch.float8_e4m3fn:
            kv_cache_dtype = "fp8_e4m3"

        ipex.llm.modules.PagedAttention.reshape_and_cache(
            key,
            value,
            key_cache,
            value_cache,
            slots,
            kv_cache_dtype=kv_cache_dtype,
            k_scale=k_scale,
            v_scale=v_scale,
        )
    else:
        raise NotImplementedError(
            f"Cannot reshape and cache for paged attention, system '{SYSTEM}' not supported"
        )


def get_kv_scales(weights: Weights, prefix: str) -> KVScales:
    """Load KV cache scales."""

    key_scale = torch.tensor(1.0, dtype=torch.float32, device=weights.device)
    value_scale = key_scale
    if weights.has_tensor(f"{prefix}.k_scale") and weights.has_tensor(
        f"{prefix}.v_scale"
    ):
        key_scale = weights.get_tensor(f"{prefix}.k_scale", to_dtype=False).float()
        value_scale = weights.get_tensor(f"{prefix}.v_scale", to_dtype=False).float()
    elif weights.has_tensor(f"{prefix}.kv_scale"):
        # Fall back to older more coarse-grained scale when available.
        key_scale = weights.get_tensor(f"{prefix}.kv_scale").float()
        value_scale = key_scale

    return KVScales(key_scale=key_scale, value_scale=value_scale)


================================================
FILE: server/text_generation_server/layers/attention/rocm.py
================================================
import os
from typing import Optional
import torch
from text_generation_server.layers.attention.kv_cache import KVCache, KVScales
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.layers.attention import Seqlen
from text_generation_server.utils.log import log_master
from text_generation_server.models.globals import (
    ATTENTION,
    BLOCK_SIZE,
)
from loguru import logger
import vllm._custom_ops as ops

major, minor = torch.cuda.get_device_capability()
is_sm75 = major == 7 and minor == 5

_PARTITION_SIZE_V1V2 = 1024
_PARTITION_SIZE_CUSTOM = 256

_GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
_ON_MI250_MI300 = any(
    arch in _GPU_ARCH for arch in ["gfx90a", "gfx940", "gfx941", "gfx942"]
)

use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"}
ENGINE = "triton" if use_triton else "ck"

use_rocm_custom_paged_attn = os.getenv("ROCM_USE_CUSTOM_PAGED_ATTN", "1") != "0"


def _use_rocm_custom_paged_attention(
    qtype: torch.dtype,
    head_size: int,
    block_size: int,
    gqa_ratio: int,
    max_seq_len: int,
) -> bool:
    # rocm custom page attention not support on navi (gfx1*)
    return (
        use_rocm_custom_paged_attn
        and _ON_MI250_MI300
        and (qtype == torch.half or qtype == torch.bfloat16)
        and (head_size == 64 or head_size == 128)
        and (block_size == 16 or block_size == 32)
        and (gqa_ratio >= 1 and gqa_ratio <= 16)
        and max_seq_len <= 131072
    )


def paged_attention(
    query: torch.Tensor,
    kv_cache: KVCache,
    kv_head_mapping: torch.Tensor,
    softmax_scale: float,
    block_tables: torch.Tensor,
    seqlen: Seqlen,
    max_s: int,
    *,
    kv_scales: KVScales,
    softcap: Optional[float] = None,
    window_size_left: Optional[int] = -1,
):
    # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
    # Copyright 2023 The vLLM team. All rights
    # reserved.
    #
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    #
    #     http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    #

    if ATTENTION == "flashdecoding":
        max_q = 1
        max_k = max_s
        import flash_attn_2_cuda

        window_size_right = -1 if window_size_left == -1 else 0

        if softcap is None:
            softcap = 0.0
        out = flash_attn_2_cuda.varlen_fwd(
            query,
            kv_cache.key,
            kv_cache.value,
            None,
            seqlen.cu_seqlen_q,
            seqlen.cu_seqlen_k,
            None,  # pad_k
            None,
            block_tables,
            None,
            max_q,
            max_k,
            0.0,  # dropout
            softmax_scale,
            False,  # zero_tensors
            True,  # causal
            window_size_left,  # Window_left
            window_size_right,  # Window right
            softcap,
            False,  # return softmax
            None,  # generator
        )
        return out[0]

    if softcap is not None:
        raise RuntimeError("Paged attention doesn't support softcapping")

    # value_cache => [num_blocks, num_heads, head_size, block_size]
    # block_size = kv_cache.value.shape[3]
    block_size = BLOCK_SIZE
    num_seqs, num_heads, head_size = query.shape

    num_kv_heads = kv_cache.key.shape[1]
    gqa_ratio = num_heads // num_kv_heads
    use_custom = _use_rocm_custom_paged_attention(
        query.dtype, head_size, block_size, gqa_ratio, max_s
    )

    if not use_custom:
        _PARTITION_SIZE = _PARTITION_SIZE_V1V2
    else:
        _PARTITION_SIZE = _PARTITION_SIZE_CUSTOM

    max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
    input_lengths = seqlen.input_lengths + seqlen.cache_lengths

    out = torch.empty_like(query)

    if kv_cache.dtype == torch.float8_e4m3fn:
        key = kv_cache.key.view(torch.uint8)
        value = kv_cache.value.view(torch.uint8)
        kv_cache_dtype = "fp8"
    else:
        key = kv_cache.key
        value = kv_cache.value
        kv_cache_dtype = "auto"

    # NOTE(woosuk): We use a simple heuristic to decide whether to use
    # PagedAttention V1 or V2. If the number of partitions is 1, we use
    # V1 to avoid the overhead of reduction. Also, if the number of
    # sequences or heads is large, we use V1 since there is enough work
    # to parallelize.
    use_v1 = (
        max_s <= 8192
        and (max_num_partitions == 1 or num_seqs * num_heads > 512)
        and not use_custom
    )
    if use_v1:
        ops.paged_attention_v1(
            out,
            query,
            key,
            value,
            num_kv_heads,
            softmax_scale,
            block_tables,
            input_lengths,
            block_size,
            max_s,
            None,
            kv_cache_dtype,
            kv_scales.key_scale_cpu,
            kv_scales.value_scale_cpu,
        )
    else:
        # Run PagedAttention V2.
        assert _PARTITION_SIZE % block_size == 0
        tmp_output = torch.zeros(
            size=(num_seqs, num_heads, max_num_partitions, head_size),
            dtype=out.dtype,
            device=out.device,
        )
        exp_sums = torch.zeros(
            size=(num_seqs, num_heads, max_num_partitions),
            dtype=torch.float32,
            device=out.device,
        )
        max_logits = torch.zeros_like(exp_sums)

        if not use_custom:
            ops.paged_attention_v2(
                out,
                exp_sums,
                max_logits,
                tmp_output,
                query,
                key,
                value,
                num_kv_heads,
                softmax_scale,
                block_tables,
                input_lengths,
                block_size,
                max_s,
                None,
                kv_cache_dtype,
                kv_scales.key_scale_cpu,
                kv_scales.value_scale_cpu,
            )
        else:
            ops.paged_attention_rocm(
                out,
                exp_sums,
                max_logits,
                tmp_output,
                query,
                key,
                value,
                num_kv_heads,
                softmax_scale,
                block_tables,
                input_lengths,
                block_size,
                max_s,
                None,
                kv_cache_dtype,
                kv_scales.key_scale_cpu,
                kv_scales.value_scale_cpu,
                None,
                _PARTITION_SIZE,
            )

    return out


if ENGINE != "triton":
    try:
        import flash_attn_2_cuda

        log_master(
            logger.info,
            "ROCm: using Flash Attention 2 Composable Kernel implementation.",
        )
    except ImportError as e:
        if major >= 8:
            architecture_suffix = f"-{SYSTEM}"
            raise ImportError(
                "Flash Attention V2 is not installed.\n"
                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
                f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
            )
        elif is_sm75:
            raise ImportError(
                "Flash Attention is not installed.\n"
                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
                "or install flash attention with `cd server && make install install-flash-attention`"
            ) from e
        else:
            for idx in range(torch.cuda.device_count()):
                name = torch.cuda.get_device_name(idx)
                if "MI210" not in name and "MI250" not in name:
                    raise ImportError(
                        f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
                    )
            raise ImportError(
                f"AMD GPU with ROCm capability {major} {minor} is not supported"
            ) from e


SUPPORTS_WINDOWING = False


def attention(
    *,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    kv_cache: KVCache,
    kv_scales: KVScales,
    seqlen: Seqlen,
    block_tables: torch.Tensor,
    softmax_scale: float,
    window_size_left: int = -1,
    causal: bool = True,
    softcap: Optional[float] = None,
):
    if ENGINE == "ck":
        if window_size_left <= 0 and window_size_left != -1:
            raise ValueError("`window_size_left` must be > 0 or -1")

        out = torch.empty_like(query)

        if softcap is None:
            softcap = 0.0

        # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load.
        return flash_attn_2_cuda.varlen_fwd(
            query,
            # flashdecoding: pass the KV caches, paged: pass the KV.
            kv_cache.key if ATTENTION == "flashdecoding" else key,
            kv_cache.value if ATTENTION == "flashdecoding" else value,
            out,
            seqlen.cu_seqlen_q,
            seqlen.cu_seqlen_k,
            None,
            None,
            block_tables if ATTENTION == "flashdecoding" else None,
            None,
            seqlen.max_q,
            seqlen.max_k,
            0.0,
            softmax_scale,
            False,
            causal,
            window_size_left,
            0,
            softcap,
            False,
            None,
        )[0]

    elif ENGINE == "triton":
        from .flash_attn_triton import triton_attention

        if softcap is not None:
            raise NotImplementedError("softcap is only available with CK flash attn")

        out = torch.empty_like(query)

        # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load.
        output, _ = triton_attention(
            query,
            key,
            value,
            out,
            seqlen.cu_seqlen_q,
            seqlen.cu_seqlen_q,
            seqlen.max_q,
            seqlen.max_k,
            causal,
            softmax_scale,
        )
        return output

    else:
        raise RuntimeError(f"Unknown attention engine {ENGINE}")


__all__ = [
    "SUPPORTS_WINDOWING",
    "attention",
    "paged_attention",
]


================================================
FILE: server/text_generation_server/layers/awq/conversion_utils.py
================================================
import torch
from typing import List


AWQ_PACK_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]
REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]


def pack(imatrix: torch.Tensor, direction: str = "column"):
    """
    Packs a 4-bit integer matrix into a packed 32-bit integer matrix.
    Args:
        imatrix (torch.Tensor): matrix of integers
        direction (str): direction of packing, either "column" or "row"
    Returns:
        qmatrix (torch.Tensor): packed matrix of integers
    """
    shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=imatrix.device)

    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow

    if direction == "column":
        imatrix = imatrix.view(-1, imatrix.shape[1] // (32 // 4), (32 // 4))
        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, :]).sum(dim=-1)

    elif direction == "row":
        imatrix = imatrix.view(imatrix.shape[0] // (32 // 4), (32 // 4), -1)
        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, :, None]).sum(dim=1)

    qmatrix = qmatrix.to(torch.int32)

    return qmatrix


def unpack(qmatrix: torch.Tensor, direction: str = "column"):
    """
    Unpacks a 32-bit packed integer matrix into a 4-bit integer matrix.
    Args:
        qmatrix (torch.Tensor): matrix of packed integers
        direction (str): direction of unpacking, either "column" or "row"
    Returns:
        imatrix (torch.Tensor): matrix of integers
    """
    shifts = torch.arange(0, 32, 4, device=qmatrix.device)

    if direction == "column":
        imatrix = torch.bitwise_right_shift(
            qmatrix[:, :, None], shifts[None, None, :]
        ).view(qmatrix.shape[0], -1)

    elif direction == "row":
        imatrix = torch.bitwise_right_shift(
            qmatrix[:, None, :], shifts[None, :, None]
        ).view(-1, qmatrix.shape[-1])

    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow

    return imatrix


def apply_order(
    imatrix: torch.Tensor,
    direction: str = "column",
    order: List[int] = AWQ_PACK_ORDER,
):
    """
    Applies the order to a 4-bit integer matrix.
    Args:
        imatrix (torch.Tensor): matrix of integers
        direction (str): direction of applying order, either "column" or "row"
        order (List[int]): order to apply, default is AWQ_PACK_ORDER
    Returns:
        imatrix (torch.Tensor): matrix of integers
    """
    if direction == "column":
        imatrix = imatrix.view(-1, (32 // 4))[:, order].view(imatrix.shape)
    elif direction == "row":
        imatrix = imatrix.view((32 // 4), -1)[order, :].view(imatrix.shape)

    return imatrix


def fast_awq_to_gptq(qweight, qzeros):
    # awq uses column packing for both weights and zeros
    izeros = unpack(qzeros, direction="column")
    iweights = unpack(qweight, direction="column")

    # Reverse the order of the iweight and izeros tensors
    izeros = apply_order(izeros, direction="column", order=REVERSE_AWQ_PACK_ORDER)
    iweights = apply_order(iweights, direction="column", order=REVERSE_AWQ_PACK_ORDER)
    # Subtract 1 from the izeros tensor (gptq adds 1 to the zeros)
    izeros = izeros - 1
    # exllama uses row packing for weights and column packing for zeros
    qzeros = pack(izeros, direction="column")
    qweight = pack(iweights, direction="row")

    return qweight, qzeros


================================================
FILE: server/text_generation_server/layers/awq/quantize/__init__.py
================================================
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM == "ipex":
    from .ipex import WQLinear
elif SYSTEM == "cuda":
    from .cuda import WQLinear

__all__ = ["WQLinear"]


================================================
FILE: server/text_generation_server/layers/awq/quantize/cuda.py
================================================
# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py

from typing import Optional
import torch
import torch.nn as nn
import awq_inference_engine  # with CUDA kernels


# class ScaledActivation(nn.Module):
#     def __init__(self, module, scales):
#         super().__init__()
#         self.act = module
#         self.scales = nn.Parameter(scales.data)
#
#     def forward(self, x):
#         return self.act(x) / self.scales.view(1, 1, -1).to(x.device)


class WQLinear(nn.Module):
    def __init__(
        self, w_bit, group_size, qweight, qzeros, scales, bias: Optional[torch.Tensor]
    ):
        super().__init__()

        if w_bit not in [4]:
            raise NotImplementedError("Only 4-bit are supported for now.")

        self.in_features = qweight.shape[0]
        self.out_features = qweight.shape[1] * 32 // w_bit

        self.w_bit = w_bit
        self.group_size = group_size if group_size != -1 else self.in_features
        # quick sanity check (make sure aligment)
        assert self.in_features % self.group_size == 0
        assert self.out_features % (32 // self.w_bit) == 0

        self.qweight = qweight
        self.qzeros = qzeros
        self.scales = scales
        self.bias = bias

    @torch.no_grad()
    def forward(self, x):
        out_shape = x.shape[:-1] + (self.out_features,)
        out = awq_inference_engine.gemm_forward_cuda(
            x.reshape(-1, x.shape[-1]), self.qweight, self.scales, self.qzeros, 8
        )
        out = out + self.bias if self.bias is not None else out
        return out.reshape(out_shape)


================================================
FILE: server/text_generation_server/layers/awq/quantize/ipex.py
================================================
from typing import Optional
import torch
import torch.nn as nn
import intel_extension_for_pytorch as ipex


class WQLinear(nn.Module):
    def __init__(
        self, w_bit, group_size, qweight, qzeros, scales, bias: Optional[torch.Tensor]
    ):
        super().__init__()

        if w_bit not in [4]:
            raise NotImplementedError("Only 4-bit are supported for now.")

        self.in_features = qweight.shape[0]
        self.out_features = qweight.shape[1] * 32 // w_bit

        self.w_bit = w_bit
        self.group_size = group_size if group_size != -1 else self.in_features
        # quick sanity check (make sure aligment)
        assert self.in_features % self.group_size == 0
        assert self.out_features % (32 // self.w_bit) == 0

        self.qweight = qweight
        self.qzeros = qzeros
        self.scales = scales
        self.bias = bias
        self.woq_linear = (
            ipex.llm.quantization.IPEXWeightOnlyQuantizedLinear.from_weight(
                self.qweight,
                self.scales,
                self.qzeros,
                self.in_features,
                self.out_features,
                bias=self.bias,
                group_size=self.group_size,
                quant_method=ipex.llm.quantization.QuantMethod.AWQ_GEMM,
                dtype=ipex.llm.quantization.QuantDtype.INT4,
            )
        )

    @torch.no_grad()
    def forward(self, x):
        out_shape = x.shape[:-1] + (self.out_features,)
        out = self.woq_linear(x.reshape(-1, x.shape[-1]))
        return out.reshape(out_shape)


================================================
FILE: server/text_generation_server/layers/bnb.py
================================================
from dataclasses import dataclass

import bitsandbytes as bnb
import torch
from bitsandbytes.nn import Int8Params, Params4bit
from text_generation_server.utils.weights import UnquantizedWeight


@dataclass
class BNBWeight(UnquantizedWeight):
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        return Linear8bitLt(self.weight, bias, has_fp16_weights=False, threshold=6.0)


class Linear8bitLt(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
        has_fp16_weights=True,
        memory_efficient_backward=False,
        threshold=0.0,
        index=None,
    ):
        super().__init__()
        assert (
            not memory_efficient_backward
        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
        self.state = bnb.MatmulLtState()
        self.index = index

        # Necessary for stacked layers
        self.state.threshold = threshold
        self.state.has_fp16_weights = has_fp16_weights
        self.state.memory_efficient_backward = memory_efficient_backward
        if threshold > 0.0 and not has_fp16_weights:
            self.state.use_pool = True

        self.weight = Int8Params(
            weight.data,
            has_fp16_weights=has_fp16_weights,
            requires_grad=has_fp16_weights,
        )
        self.weight.cuda(weight.device)
        self.bias = bias

    def init_8bit_state(self):
        self.state.CB = self.weight.CB
        self.state.SCB = self.weight.SCB
        self.weight.CB = None
        self.weight.SCB = None

    def forward(self, x: torch.Tensor):
        self.state.is_training = self.training
        if self.weight.CB is not None:
            self.init_8bit_state()

        # weights are cast automatically as Int8Params, but the bias has to be cast manually
        if self.bias is not None and self.bias.dtype != x.dtype:
            self.bias.data = self.bias.data.to(x.dtype)

        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)

        if not self.state.has_fp16_weights:
            if self.state.CB is not None and self.state.CxB is not None:
                # we converted 8-bit row major to turing/ampere format in the first inference pass
                # we no longer need the row-major weight
                del self.state.CB
                self.weight.data = self.state.CxB
        return out


@dataclass
class BNBFP4Weight(UnquantizedWeight):
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        return Linear4bit(self.weight, bias, quant_type="fp4")


@dataclass
class BNBNF4Weight(UnquantizedWeight):
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        return Linear4bit(self.weight, bias, quant_type="nf4")


class Linear4bit(torch.nn.Module):
    def __init__(self, weight, bias, quant_type):
        super().__init__()
        self.weight = Params4bit(
            weight.data,
            requires_grad=False,
            compress_statistics=True,
            quant_type=quant_type,
        )
        self.compute_dtype = None
        self.weight.cuda(weight.device)
        self.bias = bias

    def forward(self, x: torch.Tensor):
        # weights are cast automatically as Int8Params, but the bias has to be cast manually
        if self.bias is not None and self.bias.dtype != x.dtype:
            self.bias.data = self.bias.data.to(x.dtype)

        if getattr(self.weight, "quant_state", None) is None:
            print(
                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
            )
        inp_dtype = x.dtype
        if self.compute_dtype is not None:
            x = x.to(self.compute_dtype)

        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
        out = bnb.matmul_4bit(
            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
        )

        out = out.to(inp_dtype)

        return out


================================================
FILE: server/text_generation_server/layers/compressed_tensors/__init__.py
================================================
from .loader import CompressedTensorsLoader

__all__ = ["CompressedTensorsLoader"]


================================================
FILE: server/text_generation_server/layers/compressed_tensors/loader.py
================================================
from typing import Any, Dict, List, Union

from compressed_tensors import QuantizationConfig, QuantizationStatus
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import (
    QuantizationScheme,
    QuantizationType,
    find_name_or_class_matches,
)
from loguru import logger
from pydantic import ValidationError
from torch import nn

from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
from text_generation_server.layers.compressed_tensors.w8a8_int import W8A8IntLoader
from text_generation_server.layers.compressed_tensors.wna16_int_24 import (
    WNA16Int24Loader,
)
from text_generation_server.layers.compressed_tensors.wna16_int import WNA16IntLoader
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    UnquantizedWeight,
    Weights,
    WeightsLoader,
)

# compressed-tensors can match modules as quantization targets. However,
# they need to be objects rather than classes or class names. Since we
# need to match `Linear` targets, make an instance that can be re-used.
_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)


class CompressedTensorsLoader(WeightsLoader):
    """Loader for checkpoints stored in the compressed-tensors format."""

    def __init__(self, config: Dict[str, Any]):
        quantization_config_raw = config.get("quantization_config")
        if quantization_config_raw is None:
            # `compression_config` was renamed to `quantization_config`; support
            # retained for backward compatibility.
            quantization_config_raw = config.get("compression_config")
        if quantization_config_raw is None:
            raise ValueError(
                "Checkpoint does not have compressed-tensors configuration"
            )

        try:
            quantization_config = QuantizationConfig.model_validate(
                quantization_config_raw
            )
        except ValidationError as e:
            raise ValueError("Cannot parse compressed-tensors configuration") from e

        if quantization_config.quantization_status not in (
            QuantizationStatus.COMPRESSED,
            QuantizationStatus.FROZEN,
        ):
            raise ValueError(
                f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
            )

        self.ignore = (
            quantization_config.ignore if quantization_config.ignore is not None else []
        )
        self.loaders = self._get_target_loaders(quantization_config)

        for target, loader in self.loaders.items():
            log_once(
                logger.info,
                f"Using {loader} for compressed-tensors target '{target}'",
            )

    def get_weights(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights(weights, prefix)

    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_col_packed(weights, prefix, block_sizes)

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        loader = self._lookup_loader(prefixes[0])
        return loader.get_multi_weights_col(weights, prefixes, dim)

    def get_weights_row(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_row(weights, prefix)

    def _get_target_loaders(
        self, quantization_config: QuantizationConfig
    ) -> Dict[str, WeightsLoader]:
        """
        A compressed-tensors checkpoint can use different quantizations
        for different targets. This method returns a dictionary with a
        loader per target.
        """

        loaders: Dict[str, WeightsLoader] = {}

        format = quantization_config.format

        for group_name, group in quantization_config.config_groups.items():
            # The group configuration can be a string, but does that ever
            # happen in a serialized quantization config?
            assert isinstance(group, QuantizationScheme)

            loader = self._create_loader_for_group(format, group_name, group)

            # A quantized parameter group can have multiple targets, add the
            # loader for all the targets.
            for target in group.targets:
                if target in loaders:
                    raise ValueError(
                        f"Target '{target} has multiple configured loaders'"
                    )
                loaders[target] = loader

        return loaders

    def _create_loader_for_group(
        self, format: str, group_name: str, group: QuantizationScheme
    ) -> WeightsLoader:
        """
        Find and create a loader for the group with the given quantization
        scheme.
        """
        # NOTE: we ignore group.output_activations because we don't support
        #       output quantization yet.

        input_activations = group.input_activations
        weights = group.weights
        if (
            format
            in {
                CompressionFormat.float_quantized.value,
                CompressionFormat.naive_quantized.value,
            }
            and weights is not None
            and weights.type == QuantizationType.FLOAT
            and weights.num_bits == 8
        ):
            # FP W8A8 or W8A16.
            return W8ANFpLoader(input_activations=input_activations, weights=weights)
        elif (
            format == CompressionFormat.pack_quantized.value
            and weights is not None
            and weights.type == QuantizationType.INT
            and weights.num_bits in (4, 8)
        ):
            # INT W4A16 or W8A16 (GPTQ/AWQ-like).
            return WNA16IntLoader(weights)
        elif (
            format == CompressionFormat.marlin_24.value
            and weights is not None
            and weights.type == QuantizationType.INT
            and weights.num_bits in (4, 8)
        ):
            return WNA16Int24Loader(weights)
        elif (
            format
            in {
                CompressionFormat.int_quantized.value,
                CompressionFormat.naive_quantized.value,
            }
            and weights is not None
            and weights.type == QuantizationType.INT
            and weights.num_bits == 8
        ):
            return W8A8IntLoader(input_args=input_activations, weight_args=weights)
        else:
            raise ValueError(
                f"Group '{group_name}' has unsupported compressed-tensors configurtion"
            )

    def _lookup_loader(self, prefix: str) -> WeightsLoader:
        """
        Look up the loader to use for a given parameter name (prefix).
        """

        if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
            return DefaultWeightsLoader(UnquantizedWeight)

        # We currently only handle linear layers, so unconditionally pass
        # a `Linear` instance.
        targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
        if len(targets) == 0:
            raise ValueError(
                f"Cannot find compressed-tensors target for prefix: {prefix}"
            )
        return self.loaders[targets[0]]


================================================
FILE: server/text_generation_server/layers/compressed_tensors/w8a8_int.py
================================================
from typing import List, Optional, Union, TypeVar
from dataclasses import dataclass

from loguru import logger
import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationType

from text_generation_server.layers.fp8 import _load_scalar_or_matrix_scale
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader

if SYSTEM == "cuda":
    quantization = load_kernel(
        module="quantization", repo_id="kernels-community/quantization"
    )
else:
    quantization = None


class W8A8IntLoader(WeightsLoader):
    """
    Loader for w8a8 integer compressed-tensors parameters.
    """

    def __init__(
        self,
        *,
        input_args: Optional[QuantizationArgs],
        weight_args: QuantizationArgs,
    ):
        if weight_args.type != QuantizationType.INT and weight_args.num_bits != 8:
            raise ValueError(
                f"{type(self).__name__} only supports w8a8 int checkpoints"
            )

        if not weight_args.symmetric:
            raise ValueError("Checkpoints with asymmetric weights are not supported")

        self.load_weight_scale = not weight_args.dynamic

        if input_args is not None:
            self.input_symmetric = input_args.symmetric

            if not input_args.dynamic:
                log_once(
                    logger.warning,
                    "Forcing dynamic input quantization for compressed_tensors w8a8 int checkpoint (for better accuracy).",
                )
        else:
            self.input_symmetric = True

    def __str__(self) -> str:
        def scale_to_str(scale):
            return "static" if scale else "dynamic"

        def symmetric_to_str(symmetric):
            return "symmetric" if symmetric else "asymmetric"

        return f"{self.__class__.__name__} (w8a8 int, input: dynamic/{symmetric_to_str(self.input_symmetric)}, weight: {scale_to_str(self.load_weight_scale)}/symmetric))"

    def get_weights(self, weights: "Weights", prefix: str):
        w = weights.get_tensor(f"{prefix}.weight", to_dtype=False)

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(
                f"{prefix}.weight_scale", to_dtype=False
            ).reshape(-1)

        return Int8Weight(
            input_symmetric=self.input_symmetric,
            weight=w,
            weight_scale=weight_scale,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        w = weights.get_packed_sharded(
            f"{prefix}.weight", dim=0, block_sizes=block_sizes, to_dtype=False
        )

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
            if weight_scale.numel() > 1:
                weight_scale = weights.get_packed_sharded(
                    f"{prefix}.weight_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            weight_scale = weight_scale.reshape(-1)

        return Int8Weight(
            input_symmetric=self.input_symmetric,
            weight=w,
            weight_scale=weight_scale,
        )

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        w = [
            weights.get_sharded(f"{p}.weight", dim=0, to_dtype=False) for p in prefixes
        ]
        shapes = [x.shape for x in w]

        w = torch.cat(w, dim=dim)

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
                for p, shape in zip(prefixes, shapes)
            ]
            weight_scale = torch.cat(weight_scale, dim=0).reshape(-1, 1)

        return Int8Weight(
            input_symmetric=self.input_symmetric,
            weight=w,
            weight_scale=weight_scale,
        )

    def get_weights_row(self, weights: "Weights", prefix: str):
        w = weights.get_sharded(f"{prefix}.weight", dim=1, to_dtype=False)

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(
                f"{prefix}.weight_scale", to_dtype=False
            ).reshape(-1)

        return Int8Weight(
            input_symmetric=self.input_symmetric,
            weight=w,
            weight_scale=weight_scale,
        )


OtherT = TypeVar("OtherT")


def _get_tensor_or_else(
    weights: Weights, prefix: str, other: OtherT
) -> Union[torch.Tensor, OtherT]:
    # Even if a checkpoint uses e.g. zero-points, they can be elided:
    # https://github.com/neuralmagic/compressed-tensors/blob/db6ccb25b265e8370813ecab5e95714a6728b5a6/src/compressed_tensors/compressors/quantized_compressors/base.py#L105
    if weights.has_tensor(prefix):
        return weights.get_tensor(prefix, to_dtype=False)
    else:
        return other


@dataclass
class Int8Weight(Weight):
    input_symmetric: bool
    weight: torch.Tensor
    weight_scale: Optional[torch.Tensor]

    def get_linear(self, bias: torch.Tensor):
        if self.weight_scale is None:
            assert quantization is not None
            qweight, weight_scale, _ = quantization.scaled_int8_quant(self.weight)
            return W8A8IntLinear(
                bias=bias,
                input_symmetric=self.input_symmetric,
                weight=qweight,
                weight_scale=weight_scale,
            )
        else:
            return W8A8IntLinear(
                bias=bias,
                input_symmetric=self.input_symmetric,
                weight=self.weight,
                weight_scale=self.weight_scale,
            )


class W8A8IntLinear(torch.nn.Module):
    def __init__(
        self,
        *,
        bias: Optional[torch.Tensor],
        input_symmetric: bool,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
    ):
        super().__init__()

        weight_scale = weight_scale.to(torch.float32)

        self.bias = bias
        self.input_symmetric = input_symmetric
        # cutlass kernels require transposed weights.
        self.weight = weight.t()
        self.weight_scale = weight_scale

        if input_symmetric:
            self.zero_point_adj = None
        else:
            # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md#scaledepilogueazp
            self.zero_point_adj = self.weight.sum(
                dim=0, keepdim=True, dtype=torch.int32
            )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        assert quantization is not None

        qinput, input_scale, input_zero_point = quantization.scaled_int8_quant(
            input=input,
            scale=None,
            azp=None,
            symmetric=self.input_symmetric,
        )

        if self.input_symmetric:
            return quantization.cutlass_scaled_mm(
                a=qinput,
                b=self.weight,
                scale_a=input_scale,
                scale_b=self.weight_scale,
                out_dtype=input.dtype,
                bias=self.bias,
            )
        else:
            assert (
                self.zero_point_adj is not None
                and input_scale is not None
                and (self.input_symmetric or input_zero_point is not None)
            )

            return quantization.cutlass_scaled_mm_azp(
                a=qinput,
                b=self.weight,
                scale_a=input_scale,
                scale_b=self.weight_scale,
                out_dtype=input.dtype,
                azp_adj=self.zero_point_adj,
                azp=input_zero_point,
                bias=self.bias,
            )


================================================
FILE: server/text_generation_server/layers/compressed_tensors/w8an_fp.py
================================================
from typing import List, Optional, Union

import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationType

from text_generation_server.layers.fp8 import (
    Fp8Weight,
    _load_scalar_or_matrix_scale,
    requantize_with_max_scale,
    normalize_e4m3fn_to_native_float8,
)
from text_generation_server.utils.weights import Weights, WeightsLoader
from text_generation_server.utils.import_utils import SYSTEM


class W8ANFpLoader(WeightsLoader):
    """
    Loader for W8A8/W8A16 FP compressed-tensors parameters.
    """

    def __init__(
        self,
        *,
        input_activations: Optional[QuantizationArgs],
        weights: QuantizationArgs,
    ):
        assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8

        # We ignore the `strategy` option which sets the scales to be
        # per-tensor, per-channel or per-token. What scales are supported
        # is dependent on the kernels used (e.g. cutlass can do tokenwise,
        # Torch cannot, and FP8-Marlin does not quantize inputs at all).
        # So, instead we try to use the best-possible configuration.

        self.load_weight_scale = not weights.dynamic
        self.load_input_scale = (
            input_activations is not None and not input_activations.dynamic
        )
        self.force_w8a16 = (
            input_activations is not None and input_activations.num_bits == 16
        )

    def __str__(self) -> str:
        def scale_to_str(scale):
            return "static" if scale else "dynamic"

        quantization_type = f"W8A{16 if self.force_w8a16 else 8}"

        return f"{self.__class__.__name__} ({quantization_type}, weight: {scale_to_str(self.load_weight_scale)}, input: {scale_to_str(self.load_input_scale)})"

    def get_weights(self, weights: "Weights", prefix: str):
        w = weights.get_tensor(f"{prefix}.weight")

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)

            if SYSTEM == "cuda":
                weight_scale = weight_scale.reshape(-1).expand(w.shape[0])

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(
                f"{prefix}.input_scale", to_dtype=False
            ).reshape(-1)

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        w = weights.get_packed_sharded(
            f"{prefix}.weight", dim=0, block_sizes=block_sizes
        )

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
            if weight_scale.numel() > 1:
                weight_scale = weights.get_packed_sharded(
                    f"{prefix}.weight_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            if SYSTEM == "cuda":
                weight_scale = weight_scale.reshape(-1).expand(w.shape[0])

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
            if input_scale.numel() > 1:
                input_scale = weights.get_packed_sharded(
                    f"{prefix}.input_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            input_scale = input_scale.reshape(-1).max()

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
        w = [
            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
        ]
        shapes = [x.shape for x in w]

        # Concat then send to the device
        w = torch.cat(w, dim=dim).to(weights.device)

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
                for p, shape in zip(prefixes, shapes)
            ]
            weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)

        input_scale = None
        if self.load_input_scale:
            input_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
                for p, shape in zip(prefixes, shapes)
                if weights.has_tensor(f"{p}.input_scale")
            ]
            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
            input_scale = (
                torch.cat(input_scale, dim=0).reshape(-1).max()
                if len(input_scale) != 0
                else None
            )

        if self.load_weight_scale and SYSTEM == "rocm":
            w, weight_scale, input_scale = normalize_e4m3fn_to_native_float8(
                w, weight_scale, input_scale
            )

            if weight_scale.numel() == len(prefixes):
                logical_widths = [x[0] for x in shapes]
                w, weight_scale = requantize_with_max_scale(
                    w, weight_scale.to(weights.device), logical_widths, weights.dtype
                )

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_weights_row(self, weights: "Weights", prefix: str):
        w = weights.get_sharded(f"{prefix}.weight", dim=1)
        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)

            if SYSTEM == "cuda":
                weight_scale = weight_scale.reshape(-1).expand(w.shape[0])

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(
                f"{prefix}.input_scale", to_dtype=False
            ).reshape(-1)

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )


================================================
FILE: server/text_generation_server/layers/compressed_tensors/wna16_int.py
================================================
from typing import List, Union

import torch
from compressed_tensors.quantization import ActivationOrdering, QuantizationArgs
from loguru import logger

from text_generation_server.layers.marlin.gptq import repack_gptq_for_marlin
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weights, WeightsLoader


class WNA16IntLoader(WeightsLoader):
    """
    Loader for W4A16/W8A16 INT compressed-tensors parameters.
    """

    def __init__(self, weights: QuantizationArgs):
        self.weights = weights
        self.desc_act = self.weights.actorder == ActivationOrdering.GROUP
        self.groupsize = (
            -1 if self.weights.group_size is None else self.weights.group_size
        )

    def __str__(self) -> str:
        quantization_type = f"W{self.weights.num_bits}A16"

        return f"{self.__class__.__name__} ({quantization_type})"

    def get_weights(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            weight_packed = weights.get_tensor(f"{prefix}.weight_packed").t()
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )

        zero_point = None
        if not self.weights.symmetric:
            zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")

        scales = weights.get_tensor(f"{prefix}.weight.scales").t()

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=False,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        try:
            weight_packed = weights.get_packed_sharded(
                f"{prefix}.weight_packed", dim=0, block_sizes=block_sizes
            ).t()
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )
        scales = weights.get_packed_sharded(
            f"{prefix}.weight_scale", dim=0, block_sizes=block_sizes
        ).t()
        scales = scales.to(dtype=weights.dtype)

        zero_point = None
        if not self.weights.symmetric:
            zero_point = weights.get_packed_sharded(
                f"{prefix}.qzeros", dim=0, block_sizes=block_sizes
            ).t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_tensor(f"{prefix}.g_idx")

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=False,
        )

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        try:
            weight_packed = torch.cat(
                [
                    weights.get_sharded(f"{p}.weight_packed", dim=0).t()
                    for p in prefixes
                ],
                dim=1,
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )

        scales = torch.cat(
            [weights.get_sharded(f"{p}.weight_scale", dim=0).t() for p in prefixes],
            dim=1,
        )

        zero_point = None
        if not self.weights.symmetric:
            zero_point = torch.cat(
                [weights.get_sharded(f"{p}.qzeros", dim=0).t() for p in prefixes], dim=1
            ).t()

        g_idx = None
        if self.desc_act:
            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=False,
        )

    def get_weights_row(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=1).t()
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
            )

        zero_point = None
        if not self.weights.symmetric:
            if self.desc_act or self.groupsize == -1:
                zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
            else:
                zero_point = weights.get_sharded(
                    f"{prefix}.weight_zero_point", dim=1
                ).t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)

        if self.desc_act or self.groupsize == -1:
            scales = weights.get_tensor(f"{prefix}.weight_scale").t()
        else:
            scales = weights.get_sharded(f"{prefix}.weight_scale", dim=1).t()

        sharded_in_features = weights.process_group.size() > 1

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=sharded_in_features,
        )


================================================
FILE: server/text_generation_server/layers/compressed_tensors/wna16_int_24.py
================================================
from typing import List, Union

import torch


from compressed_tensors.quantization import QuantizationArgs, QuantizationType
from text_generation_server.layers.marlin.marlin import GPTQMarlin24Weight
from text_generation_server.utils.weights import Weights, WeightsLoader


class WNA16Int24Loader(WeightsLoader):
    """
    Loader for W4A16/W8A16 INT 2:4 sparsity compressed-tensors checkpoints.
    """

    def __init__(self, weight_args: QuantizationArgs):
        super().__init__()

        if weight_args.type != QuantizationType.INT:
            raise ValueError(
                f"{type(self).__name__} only supports wNa8 int checkpoints"
            )

        if weight_args.strategy == "group" and weight_args.group_size is None:
            raise ValueError("`group_size` must be set when `actorder` is `group`")

        self.bits = weight_args.num_bits
        self.group_size = weight_args.group_size

    def __str__(self) -> str:
        quantization_type = f"W{self.bits}A16 2:4 sparsity"

        return f"{self.__class__.__name__} ({quantization_type})"

    def get_weights(self, weights: Weights, prefix: str):
        """
        Get weights at the given prefix and apply without tensor paralllism.
        """
        weight_packed = weights.get_tensor(f"{prefix}.weight_packed")
        meta = weights.get_tensor(f"{prefix}.meta")
        scale_packed = weights.get_tensor(f"{prefix}.scale_packed")
        return GPTQMarlin24Weight(
            weight_packed=weight_packed,
            meta=meta,
            scale_packed=scale_packed,
            bits=self.bits,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        weight_packed = weights.get_packed_sharded(
            f"{prefix}.weight_packed", dim=1, block_sizes=block_sizes
        )
        meta = weights.get_packed_sharded(
            f"{prefix}.meta", dim=1, block_sizes=block_sizes
        )
        scale_packed = weights.get_packed_sharded(
            f"{prefix}.scale_packed", dim=1, block_sizes=block_sizes
        )
        return GPTQMarlin24Weight(
            weight_packed=weight_packed,
            meta=meta,
            scale_packed=scale_packed,
            bits=self.bits,
        )

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        weight_packed = torch.cat(
            [weights.get_sharded(f"{p}.weight_packed", dim=1) for p in prefixes], dim=1
        )
        meta = torch.cat(
            [weights.get_sharded(f"{p}.meta", dim=1) for p in prefixes], dim=1
        )
        scale_packed = torch.cat(
            [weights.get_sharded(f"{p}.scale_packed", dim=1) for p in prefixes], dim=1
        )
        return GPTQMarlin24Weight(
            weight_packed=weight_packed,
            meta=meta,
            scale_packed=scale_packed,
            bits=self.bits,
        )

    def get_weights_row(self, weights: Weights, prefix: str):
        weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=0)
        meta = weights.get_sharded(f"{prefix}.meta", dim=0)
        if self.group_size is None:
            scale_packed = weights.get_tensor(f"{prefix}.scale_packed")
        else:
            scale_packed = weights.get_sharded(f"{prefix}.scale_packed", dim=0)

        return GPTQMarlin24Weight(
            weight_packed=weight_packed,
            meta=meta,
            scale_packed=scale_packed,
            bits=self.bits,
        )


================================================
FILE: server/text_generation_server/layers/conv.py
================================================
from accelerate import init_empty_weights
import torch


@classmethod
def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
    weight = weights.get_tensor(f"{prefix}.weight")
    bias = weights.get_tensor(f"{prefix}.bias")
    with init_empty_weights():
        conv2d = cls(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
        )

    conv2d.weight = torch.nn.Parameter(weight)
    conv2d.bias = torch.nn.Parameter(bias)
    return conv2d


@classmethod
def load_conv2d_no_bias(
    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
):
    weight = weights.get_tensor(f"{prefix}.weight")
    with init_empty_weights():
        conv2d = cls(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
        )

    conv2d.weight = torch.nn.Parameter(weight)
    conv2d.bias = None
    return conv2d


torch.nn.Conv2d.load = load_conv2d
torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias


================================================
FILE: server/text_generation_server/layers/eetq.py
================================================
from dataclasses import dataclass

import torch
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.weights import UnquantizedWeight

quantization_eetq = load_kernel(
    module="quantization_eetq", repo_id="kernels-community/quantization-eetq"
)


@dataclass
class EETQWeight(UnquantizedWeight):
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        try:
            from text_generation_server.layers.eetq import EETQLinear

            return EETQLinear(self.weight, bias)
        except ImportError:
            raise ImportError(
                "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
            )


class EETQLinear(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        device = weight.device
        if weight.dtype != torch.float16:
            weight = weight.to(dtype=torch.float16)
        weight = torch.t(weight).contiguous().cpu()
        weight, scale = quantization_eetq.quant_weights(weight, torch.int8, False)

        self.weight = weight.cuda(device)
        self.scale = scale.cuda(device)
        self.bias = bias.cuda(device) if bias is not None else None

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = quantization_eetq.w8_a16_gemm(input, self.weight, self.scale)
        output = output + self.bias if self.bias is not None else output
        return output


================================================
FILE: server/text_generation_server/layers/exl2.py
================================================
from dataclasses import dataclass
from typing import List, Union

import torch
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader


@dataclass
class Exl2Weight(Weight):
    """
    Exllama2 exl2 quantized weights.
    """

    q_weight: torch.Tensor
    q_scale: torch.Tensor
    q_invperm: torch.Tensor
    q_scale_max: torch.Tensor
    q_groups: torch.Tensor

    def __post_init__(self):
        self.q_scale_max /= 256
        self.q_invperm = self.q_invperm.short()

    @property
    def device(self) -> torch.device:
        return self.q_weight.device

    def get_linear(self, bias: torch.Tensor):
        from text_generation_server.layers.gptq import ExllamaQuantLinear

        return ExllamaQuantLinear(self, bias)


class Exl2WeightsLoader(WeightsLoader):
    """Loader for exl2-quantized weights."""

    def get_weights(self, weights: "Weights", prefix: str):
        """
        Get weights at the given prefix and apply without tensor paralllism.
        """
        try:
            q_weight = weights.get_tensor(f"{prefix}.q_weight")
        except RuntimeError:
            raise RuntimeError(
                "Cannot load `exl2`-quantized weight, make sure the model is already quantized."
            )

        q_scale = weights.get_tensor(f"{prefix}.q_scale")
        q_invperm = weights.get_tensor(f"{prefix}.q_invperm")
        q_scale_max = weights.get_tensor(f"{prefix}.q_scale_max")
        q_groups = weights.get_tensor(f"{prefix}.q_groups")

        return Exl2Weight(
            q_weight=q_weight,
            q_scale=q_scale,
            q_invperm=q_invperm,
            q_scale_max=q_scale_max,
            q_groups=q_groups,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        raise RuntimeError("Column-packed weights are not supported for exl")

    def get_weights_col(self, weights: Weights, prefix: str):
        # Sharding is not yet supported, so we return the weights as-is.
        return self.get_weights(weights, prefix)

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        raise ValueError("get_multi_weights_col is not supported for exl2")

    def get_weights_row(self, weights: Weights, prefix: str):
        # Sharding is not yet supported, so we return the weights as-is.
        return self.get_weights(weights, prefix)


================================================
FILE: server/text_generation_server/layers/fp8.py
================================================
from dataclasses import dataclass
import os
from typing import Optional, Tuple, Type, Union, List

import torch
from loguru import logger

from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.weights import (
    Weight,
    WeightsLoader,
    UnquantizedWeight,
    Weights,
)
from text_generation_server.utils.log import log_once

if SYSTEM == "cuda":
    quantization = load_kernel(
        module="quantization", repo_id="kernels-community/quantization"
    )
else:
    quantization = None

try:
    from moe_kernels.fp8_utils import w8a8_block_fp8_matmul, per_token_group_quant_fp8
except ImportError:
    w8a8_block_fp8_matmul = None
    per_token_group_quant_fp8 = None

quant_dtype: torch.dtype = (
    torch.float8_e4m3fnuz if SYSTEM == "rocm" else torch.float8_e4m3fn
)

if SYSTEM == "cuda" and quantization is not None:
    major, minor = torch.cuda.get_device_capability()
    CUTLASS_FP8_AVAILABLE = quantization.cutlass_scaled_mm_supports_fp8(
        major * 10 + minor
    )
else:
    CUTLASS_FP8_AVAILABLE = False


def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
    """
    Return an FP8 linear `Module` that is compatible with the current system.
    """

    if SYSTEM == "cuda":
        major, _ = torch.cuda.get_device_capability()
        # Marlin is W8A16, use it when:
        #
        # - On capability 8.x where x < 8: W8A8 FP8 GEMM is not supported.
        # - On capability 8.9: W8A8 FP8 GEMM is supported, but Marlin-FP8 is faster.
        # - On capability 9.x when force_w8a16: cutlass kernels do not support W8A16.
        if (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
            "USE_CUTLASS_W8A8", "0"
        ) != "1":
            # NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin
            #       gives better decoding throughput on L4 and L40.
            from text_generation_server.layers.marlin import GPTQMarlinFP8Linear

            if major == 8 and minor == 9:
                log_once(
                    logger.info,
                    "GPU supports FP8, but using Marlin FP8 kernel for better performance",
                )
            else:
                log_once(
                    logger.info, "GPU does not support FP8, using Marlin FP8 kernel"
                )

            return GPTQMarlinFP8Linear

    # On other systems let Torch decide if the hardware supports FP8.
    return Fp8Linear


def normalize_e4m3fn_to_native_float8(
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    if weight.dtype == torch.float8_e4m3fn and SYSTEM == "rocm":
        # The bits pattern 10000000(-128) represents zero in e4m3fn
        # but NaN in e4m3fnuz. So here we set it to 0.
        # https://onnx.ai/onnx/technical/float8.html
        weight_as_int8 = weight.view(torch.int8)
        ROCM_FP8_NAN_AS_INT = -128
        weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0
        weight = weight_as_int8.view(torch.float8_e4m3fnuz)

        # For the same bits representation, e4m3fnuz value is half of
        # the e4m3fn value, so we should double the scaling factor to
        # get the same dequantized value.
        # https://onnx.ai/onnx/technical/float8.html
        weight_scale = weight_scale * 2.0
        if input_scale is not None:
            input_scale = input_scale * 2.0
    return weight, weight_scale, input_scale


def per_tensor_dequantize(
    tensor: torch.Tensor,
    inv_scale: Union[float, torch.Tensor],
    dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    fake_qweight = tensor.to(dtype)
    dq_weight = fake_qweight * inv_scale
    return dq_weight


def requantize_with_max_scale(
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    logical_widths: int,
    dtype: torch.dtype,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Max scale to be used for requanitzation.
    max_w_scale = weight_scale.max().float()

    start = 0
    for idx, logical_width in enumerate(logical_widths):
        end = start + logical_width
        weight_dq = per_tensor_dequantize(
            weight[start:end, :], weight_scale[idx], dtype
        )
        weight[start:end, :], max_w_scale_normalized = fp8_quantize(
            weight_dq, max_w_scale
        )
        start = end

    return weight, max_w_scale_normalized


def fp8_quantize(
    weight: torch.Tensor,
    scale: Optional[torch.Tensor] = None,
    scale_upper_bound: Optional[torch.Tensor] = None,
    qdtype: torch.dtype = torch.float8_e4m3fn,
    scalar: bool = False,
):
    """
    This function returns a reciprocal of the scale, so that a tensor can be unscaled
    by multiplying it with the returned scale. If a scale is given through the `scale`
    argument, it must also be a reciprocal (so that scales from an FP8 checkpoint can
    be used without modification).
    """
    if quantization is not None:
        shape = weight.shape
        qweight, scale = quantization.scaled_fp8_quant(
            weight.reshape(-1, shape[-1]),
            scale=scale,
            scale_ub=scale_upper_bound,
            # TODO: don't do this when we have to use the Torch kernel.
            use_per_token_if_dynamic=not scalar,
        )

        return qweight.reshape(shape), scale

    finfo = torch.finfo(qdtype)

    if scale is None:
        # Calculate the scale as dtype max divided by absmax
        scale = finfo.max / weight.abs().max().clamp(min=1e-12, max=scale_upper_bound)
        # scale and clamp the tensor to bring it to
        # the representative range of float8 data type
        # (as default cast is unsaturated)
        qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max)
        scale = scale.float().reciprocal()
    else:
        if SYSTEM == "rocm":
            scale = scale / 2.0
        # Use reciprocal to avoid more expensive division.
        qweight = (weight * scale.reciprocal()).clamp(min=finfo.min, max=finfo.max)

    # Return both float8 data and the inverse scale (as float),
    # as both required as inputs to torch._scaled_mm
    qweight = qweight.to(qdtype)

    if SYSTEM == "rocm":
        qweight, scale, _ = normalize_e4m3fn_to_native_float8(qweight, scale)

    return qweight, scale


class HybridFP8UnquantLoader(WeightsLoader):
    """Weight loader that loads FP8 and unquantized Torch tensors."""

    def __init__(
        self,
        activation_scale_ub: Optional[float],
        to_fp8: bool,
        weight_block_size: Optional[List[int]] = None,
    ):
        self.activation_scale_ub = activation_scale_ub
        self.to_fp8 = to_fp8
        self.weight_block_size = weight_block_size

    def get_weights(self, weights: "Weights", prefix: str):
        w = weights.get_tensor(f"{prefix}.weight")

        if w.dtype == torch.float8_e4m3fn:
            if self.weight_block_size is not None:
                scale = weights.get_tensor(f"{prefix}.weight_scale_inv")
                return Fp8Weight(
                    weight=w,
                    weight_scale=scale,
                    activation_scale_ub=self.activation_scale_ub,
                    dtype=weights.dtype,
                    weight_block_size=self.weight_block_size,
                )
            # FP8 branch
            scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)

            if SYSTEM == "cuda":
                scale.reshape(-1).expand(w.shape[0])

            input_scale = None
            if weights.has_tensor(f"{prefix}.input_scale"):
                input_scale = (
                    weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
                    .reshape(-1)
                    .max()
                )

            return Fp8Weight(
                weight=w,
                weight_scale=scale,
                input_scale=input_scale,
                activation_scale_ub=self.activation_scale_ub,
                dtype=weights.dtype,
            )
        if self.to_fp8:
            return Fp8Weight(weight=w, dtype=weights.dtype)

        return UnquantizedWeight(w)

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        w = weights.get_packed_sharded(
            f"{prefix}.weight", dim=0, block_sizes=block_sizes
        )

        if w.dtype == torch.float8_e4m3fn:
            # FP8 branch
            scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)

            if scale.numel() > 1:
                scale = weights.get_packed_sharded(
                    f"{prefix}.weight_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            if SYSTEM == "cuda":
                scale = scale.reshape(-1).expand(w.shape[0])

            input_scale = None
            if weights.has_tensor(f"{prefix}.input_scale"):
                input_scale = weights.get_tensor(
                    f"{prefix}.input_scale", to_dtype=False
                )
                if input_scale.numel() > 1:
                    input_scale = weights.get_packed_sharded(
                        f"{prefix}.input_scale",
                        dim=0,
                        block_sizes=block_sizes,
                        to_dtype=False,
                    )
                input_scale = input_scale.reshape(-1).max()

            return Fp8Weight(
                weight=w,
                weight_scale=scale,
                input_scale=input_scale,
                activation_scale_ub=self.activation_scale_ub,
                dtype=weights.dtype,
            )
        if self.to_fp8:
            return Fp8Weight(weight=w, dtype=weights.dtype)

        return UnquantizedWeight(w)

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
        w = [
            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
        ]
        shapes = [x.shape for x in w]

        # Concat then send to the device
        w = torch.cat(w, dim=dim).to(weights.device)

        # FP8 branch
        if w.dtype == torch.float8_e4m3fn:
            if self.weight_block_size is not None:
                scale = [
                    weights.get_sharded(f"{p}.weight_scale_inv", dim=0, to_device=False)
                    for p in prefixes
                ]
                scale = torch.cat(scale, dim=dim)
                scale = scale.to(weights.device)
                return Fp8Weight(
                    weight=w,
                    weight_scale=scale,
                    activation_scale_ub=self.activation_scale_ub,
                    dtype=weights.dtype,
                    weight_block_size=self.weight_block_size,
                )

            scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
                for p, shape in zip(prefixes, shapes)
            ]
            scale = torch.cat(scale, dim=0).reshape(-1)

            input_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
                for p, shape in zip(prefixes, shapes)
                if weights.has_tensor(f"{p}.input_scale")
            ]
            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
            input_scale = (
                torch.cat(input_scale, dim=0).reshape(-1).max()
                if len(input_scale) != 0
                else None
            )

            if SYSTEM == "rocm":
                w, scale, input_scale = normalize_e4m3fn_to_native_float8(
                    w, scale, input_scale
                )

                if scale.numel() == len(prefixes):
                    logical_widths = [x[0] for x in shapes]
                    w, scale = requantize_with_max_scale(
                        w, scale.to(weights.device), logical_widths, weights.dtype
                    )

            return Fp8Weight(
                weight=w,
                weight_scale=scale,
                input_scale=input_scale,
                activation_scale_ub=self.activation_scale_ub,
                dtype=weights.dtype,
            )
        if self.to_fp8:
            return Fp8Weight(weight=w, dtype=weights.dtype)

        return UnquantizedWeight(w)

    def get_weights_row(self, weights: "Weights", prefix: str):
        w = weights.get_sharded(f"{prefix}.weight", dim=1)
        # FP8 branch
        if w.dtype == torch.float8_e4m3fn:
            if self.weight_block_size is not None:
                # XXX: Yes the weights is named scale_inv, but corresponds to scale it seems.
                scale = weights.get_sharded(f"{prefix}.weight_scale_inv", dim=1)

                return Fp8Weight(
                    weight=w,
                    weight_scale=scale,
                    activation_scale_ub=self.activation_scale_ub,
                    dtype=weights.dtype,
                    weight_block_size=self.weight_block_size,
                )

            scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)

            if SYSTEM == "cuda":
                scale = scale.reshape(-1).expand(w.shape[0])

            input_scale = None
            if weights.has_tensor(f"{prefix}.input_scale"):
                input_scale = (
                    weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
                    .reshape(-1)
                    .max()
                )

            return Fp8Weight(
                weight=w,
                weight_scale=scale,
                input_scale=input_scale,
                activation_scale_ub=self.activation_scale_ub,
                dtype=weights.dtype,
            )
        if self.to_fp8:
            return Fp8Weight(weight=w, dtype=weights.dtype)

        return UnquantizedWeight(w)


@dataclass
class Fp8Weight(Weight):
    weight: torch.Tensor
    dtype: torch.dtype
    weight_scale: Optional[torch.Tensor] = None
    input_scale: Optional[torch.Tensor] = None
    activation_scale_ub: Optional[float] = None
    force_w8a16: bool = False
    weight_block_size: Optional[List[int]] = None

    def get_linear(self, bias: torch.Tensor):
        if self.weight_scale is None:
            return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant(
                self.weight, bias, self.dtype
            )
        # This is not checked by the fbgemm kernels, but they require contiguous
        # memory. Can be non-contiguous when we e.g. expand from scalars.
        self.weight_scale = self.weight_scale.contiguous()
        return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8(
            weight=self.weight,
            scale=self.weight_scale,
            dtype=self.dtype,
            bias=bias,
            input_scale=self.input_scale,
            scale_upper_bound=self.activation_scale_ub,
            weight_block_size=self.weight_block_size,
        )


class Fp8Linear(torch.nn.Module):
    _device_identity_cache = {}

    def __init__(
        self,
        qweight: torch.Tensor,
        scale: torch.Tensor,
        dtype: torch.dtype,
        bias: Optional[torch.Tensor] = None,
        input_scale: Optional[torch.Tensor] = None,
        scale_upper_bound: Optional[float] = None,
        weight_block_size: Optional[List[int]] = None,
    ) -> None:
        super().__init__()
        if CUTLASS_FP8_AVAILABLE:
            log_once(logger.info, "Using cutlass w8a8 kernels")
        if SYSTEM == "rocm" and qweight.dtype == torch.float8_e4m3fn:
            qweight, scale, input_scale = normalize_e4m3fn_to_native_float8(
                weight=qweight, weight_scale=scale, input_scale=input_scale
            )

        self.dtype = dtype
        self.qweight = qweight
        self.scale = scale.float()
        self.input_scale = input_scale.float() if input_scale is not None else None
        self.weight_block_size = weight_block_size

        if CUTLASS_FP8_AVAILABLE and scale_upper_bound is not None:
            self.scale_upper_bound = torch.tensor(
                scale_upper_bound, dtype=torch.float32, device=qweight.device
            )
        else:
            self.scale_upper_bound = scale_upper_bound

        self.bias = bias if bias is not None else None

    @classmethod
    def from_unquant(cls, weight, bias, dtype):
        qweight, scale = fp8_quantize(weight, scalar=not CUTLASS_FP8_AVAILABLE)
        return cls(
            qweight=qweight,
            scale=scale,
            dtype=dtype,
            bias=bias,
            input_scale=None,
            scale_upper_bound=None,
        )

    @classmethod
    def from_fp8(
        cls,
        weight: torch.Tensor,
        scale: torch.Tensor,
        dtype: torch.dtype,
        bias: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> "Fp8Linear":
        input_scale = kwargs.get("input_scale", None)
        scale_upper_bound = kwargs.get("scale_upper_bound", None)
        weight_block_size = kwargs.get("weight_block_size", None)

        return cls(
            qweight=weight,
            scale=scale,
            input_scale=input_scale,
            scale_upper_bound=scale_upper_bound,
            bias=bias,
            dtype=dtype,
            weight_block_size=weight_block_size,
        )

    @classmethod
    def get_shared_device_identity(cls, device):
        # Input scaling factors are no longer optional in _scaled_mm starting
        # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
        if device not in cls._device_identity_cache:
            cls._device_identity_cache[device] = torch.ones(1, device=device)
        return cls._device_identity_cache[device]

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if self.weight_block_size is not None:
            # https://arxiv.org/pdf/2412.19437
            # At a more granular level. As illustrated in Figure 7 (a), (1) for activations, we group and
            # scale elements on a 1x128 tile basis (i.e., per token per 128 channels); and (2) for weights, we
            # group and scale elements on a 128x128 block basis (i.e., per 128 input channels per 128 output
            # channels).
            qinput, scale = per_token_group_quant_fp8(input, self.weight_block_size[1])
            output = w8a8_block_fp8_matmul(
                qinput,
                self.qweight,
                scale,
                self.scale,
                self.weight_block_size,
                output_dtype=input.dtype,
            )

            if self.bias is not None:
                output = output + self.bias
            return output.to(dtype=input.dtype)
        if CUTLASS_FP8_AVAILABLE:
            # cutlass FP8 supports per-token scales, so get non-scalar scales.
            qinput, scale = fp8_quantize(
                input, scale_upper_bound=self.scale_upper_bound, scalar=False
            )
            return quantization.cutlass_scaled_mm(
                qinput, self.qweight.t(), scale, self.scale, input.dtype, self.bias
            )

        qinput, scale = fp8_quantize(
            input,
            self.input_scale,
            scale_upper_bound=self.scale_upper_bound,
            scalar=True,
        )

        per_tensor_weights = self.scale.numel() == 1
        per_tensor_activations = scale.numel() == 1

        if SYSTEM != "rocm" or (per_tensor_weights and per_tensor_activations):
            output = torch._scaled_mm(
                qinput,
                self.qweight.t(),
                out_dtype=self.dtype,
                scale_a=scale,
                scale_b=self.scale,
                bias=self.bias,
            )

            if isinstance(output, tuple) and len(output) == 2:
                output = output[0]
        else:
            device_identity = None
            if SYSTEM == "rocm":
                device_identity = self.get_shared_device_identity(self.qweight.device)

            output = torch._scaled_mm(
                qinput,
                self.qweight.t(),
                scale_a=device_identity,
                scale_b=device_identity,
                out_dtype=torch.float32,
            )
            if isinstance(output, tuple) and len(output) == 2:
                output = output[0]

            output = output * scale * self.scale.t()
            if self.bias is not None:
                output = output + self.bias

            output = output.to(dtype=self.dtype)

        return output


def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: torch.Size):
    scale = weights.get_tensor(prefix, to_dtype=False)

    if scale.numel() > 1:
        scale = weights.get_sharded(prefix, dim=0, to_dtype=False)
    elif SYSTEM == "rocm":
        return scale.reshape(-1)
    return scale.reshape(-1).expand(shape[0])


================================================
FILE: server/text_generation_server/layers/gptq/__init__.py
================================================
import os
from dataclasses import dataclass
from typing import List, Optional, Union

import torch
from loguru import logger
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    Weight,
    Weights,
    WeightsLoader,
    DefaultWeightsLoader,
)
import math


@dataclass
class GPTQWeight(Weight):
    qweight: torch.Tensor
    qzeros: torch.Tensor
    scales: torch.Tensor
    g_idx: Optional[torch.Tensor]
    bits: int
    groupsize: int
    use_awq_kernel: bool
    use_exllama: bool

    def __post_init__(self):
        if self.scales.dtype == torch.float:
            self.scales = self.scales.half()

    @property
    def device(self) -> torch.device:
        return self.qweight.device

    def get_linear(self, bias: torch.Tensor):
        if self.use_awq_kernel:
            if SYSTEM == "rocm":
                raise NotImplementedError(
                    "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead "
                    "to use Exllama/GPTQ kernels for AWQ inference."
                )
            try:
                from text_generation_server.layers.awq.quantize import WQLinear

                return WQLinear(
                    w_bit=self.bits,
                    group_size=self.groupsize,
                    qweight=self.qweight,
                    qzeros=self.qzeros,
                    scales=self.scales,
                    bias=bias,
                )
            except ImportError:
                raise NotImplementedError(
                    "You do not seem to have awq installed, either install it (cd server &&  make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
                )
        elif self.use_exllama:
            try:
                from text_generation_server.layers.gptq import ExllamaQuantLinear
            except ImportError:
                raise NotImplementedError(
                    "Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`"
                )

            return ExllamaQuantLinear(self, bias)
        else:
            if SYSTEM == "ipex" and not (
                self.device.type == "xpu"
                and (
                    self.bits != 4
                    or math.ceil(
                        (self.qweight.shape[0] * 32 // self.bits) / self.groupsize
                    )
                    != self.scales.shape[0]
                )
            ):
                from .ipex import QuantLinear
            else:
                from .triton import QuantLinear
            return QuantLinear(
                self.qweight,
                self.qzeros,
                self.scales,
                self.g_idx,
                bias,
                self.bits,
                self.groupsize,
            )


class GPTQWeightsLoader(WeightsLoader):
    """
    Loader for GPTQ- and AWQ-quantized weights.
    """

    def __init__(
        self,
        *,
        bits: int,
        desc_act: bool,
        groupsize: int,
        quant_method: str,
        quantize: str,
        sym: bool,
        modules_to_not_convert: List[str],
    ):
        self.bits = bits
        self.desc_act = desc_act
        self.groupsize = groupsize
        self.quant_method = quant_method
        self.quantize = quantize
        self.sym = sym
        self.modules_to_not_convert = modules_to_not_convert

    def get_weights(self, weights: Weights, prefix: str):
        self._get_gptq_params(weights)

        use_exllama = True
        if self.bits != 4:
            use_exllama = False

        if self.desc_act:
            log_once(logger.warning, "Disabling exllama because desc_act=True")
            use_exllama = False

        if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert):
            return DefaultWeightsLoader.get_weights(weights, prefix)

        try:
            qweight = weights.get_tensor(f"{prefix}.qweight")
        except RuntimeError:
            raise RuntimeError(
                "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
            )

        if self.quantize == "gptq" and self.quant_method == "gptq":
            g_idx = weights.get_tensor(f"{prefix}.g_idx")
        else:
            g_idx = None

        from text_generation_server.layers.gptq import (
            HAS_EXLLAMA,
            CAN_EXLLAMA,
            GPTQWeight,
        )

        if use_exllama:
            if not HAS_EXLLAMA:
                if CAN_EXLLAMA:
                    log_once(
                        logger.warning,
                        "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True",
                    )
                use_exllama = False
            else:
                log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}")

        qzeros = weights.get_tensor(f"{prefix}.qzeros")
        scales = weights.get_tensor(f"{prefix}.scales")

        if use_exllama and g_idx is not None:
            g_idx = g_idx - g_idx[0]

        if self.quantize == "gptq" and self.quant_method == "awq":
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
            if use_exllama:
                g_idx = None
            else:
                g_idx = (
                    torch.arange(
                        qweight.shape[0] * (32 // self.bits),
                        device=qweight.device,
                    )
                    // self.groupsize
                ).to(dtype=torch.int32)

        return GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=self.bits,
            groupsize=self.groupsize,
            use_awq_kernel=self.quantize == "awq",
            use_exllama=use_exllama,
        )

    def is_layer_skipped_quantization(
        self, prefix: str, modules_to_not_convert: List[str]
    ):
        return any(module_name in prefix for module_name in modules_to_not_convert)

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert):
            return DefaultWeightsLoader.get_weights_col_packed(
                weights, prefix, block_sizes
            )
        try:
            qweight = weights.get_packed_sharded(
                f"{prefix}.qweight", dim=1, block_sizes=block_sizes
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
            )
        scales = weights.get_packed_sharded(
            f"{prefix}.scales", dim=1, block_sizes=block_sizes
        )
        scales = scales.to(dtype=weights.dtype)

        self._get_gptq_params(weights)

        qzeros = weights.get_packed_sharded(
            f"{prefix}.qzeros", dim=1, block_sizes=block_sizes
        )
        if self.quantize == "gptq" and self.quant_method == "gptq":
            g_idx = weights.get_tensor(f"{prefix}.g_idx")
        elif self.quantize == "gptq" and self.quant_method == "awq":
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
            g_idx = (
                torch.arange(
                    qweight.shape[0] * (32 // self.bits),
                    device=qweight.device,
                )
                // self.groupsize
            ).to(dtype=torch.int32)
        else:
            g_idx = None

        return GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=self.bits,
            groupsize=self.groupsize,
            use_awq_kernel=self.quantize == "awq",
            use_exllama=False,
        )

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        if self.is_layer_skipped_quantization(prefixes[0], self.modules_to_not_convert):
            return DefaultWeightsLoader.get_multi_weights_col(weights, prefixes, dim)
        try:
            qweight = torch.cat(
                [weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
            )

        scales = torch.cat(
            [weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
        )

        self._get_gptq_params(weights)

        qzeros = torch.cat(
            [weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
        )

        from text_generation_server.layers.gptq import HAS_EXLLAMA

        use_exllama = (
            self.bits == 4
            and HAS_EXLLAMA
            and self.quantize == "gptq"
            and not self.desc_act
        )

        if self.quantize == "gptq" and self.quant_method == "gptq":
            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]
        elif self.quantize == "gptq" and self.quant_method == "awq":
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
            if use_exllama:
                g_idx = None
            else:
                g_idx = (
                    torch.arange(
                        qweight.shape[0] * (32 // self.bits),
                        device=qweight.device,
                    )
                    // self.groupsize
                ).to(dtype=torch.int32)
        else:
            g_idx = None

        return GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=self.bits,
            groupsize=self.groupsize,
            use_awq_kernel=self.quantize == "awq",
            use_exllama=use_exllama,
        )

    def get_weights_row(self, weights: Weights, prefix: str):
        self._get_gptq_params(weights)

        use_exllama = True
        desc_act = self.desc_act
        if self.bits != 4:
            use_exllama = False

        if self.desc_act:
            log_once(logger.warning, "Disabling exllama because desc_act=True")
            use_exllama = False

        if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert):
            return DefaultWeightsLoader.get_weights_row(weights, prefix)
        try:
            qweight = weights.get_sharded(f"{prefix}.qweight", dim=0)
        except RuntimeError:
            raise RuntimeError(
                "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
            )

        if self.quantize == "gptq" and self.quant_method == "gptq":
            g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)
        else:
            g_idx = None

        if weights.process_group.size() > 1:
            if g_idx is not None:
                if (
                    not torch.equal(
                        # Remove g_idx[0] to adapt the check with TP>1.
                        (g_idx - g_idx[0]).cpu(),
                        torch.tensor(
                            [i // self.groupsize for i in range(g_idx.shape[0])],
                            dtype=torch.int32,
                        ),
                    )
                    and not (g_idx == 0).all()
                ):
                    # Exllama implementation does not support row tensor parallelism with act-order, as
                    # it would require to reorder input activations that are split unto several GPUs
                    use_exllama = False
                    desc_act = True

        from text_generation_server.layers.gptq import (
            CAN_EXLLAMA,
            HAS_EXLLAMA,
            GPTQWeight,
        )

        if use_exllama:
            if not HAS_EXLLAMA:
                if CAN_EXLLAMA:
                    log_once(
                        logger.warning,
                        "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True",
                    )
                use_exllama = False
            else:
                log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}")

        if not desc_act and self.groupsize != -1:
            qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0)
            scales = weights.get_sharded(f"{prefix}.scales", dim=0)
            if g_idx is not None:
                # qzeros, scales sharded, and g_idx must be adjusted accordingly
                g_idx = g_idx - g_idx[0]
        else:
            qzeros = weights.get_tensor(f"{prefix}.qzeros")
            scales = weights.get_tensor(f"{prefix}.scales")

        if self.quantize == "gptq" and self.quant_method == "awq":
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
            if use_exllama:
                g_idx = None
            else:
                g_idx = (
                    torch.arange(
                        qweight.shape[0] * (32 // self.bits),
                        device=qweight.device,
                    )
                    // self.groupsize
                ).to(dtype=torch.int32)

        return GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=self.bits,
            groupsize=self.groupsize,
            use_awq_kernel=self.quantize == "awq",
            use_exllama=use_exllama,
        )

    def _get_gptq_params(self, weights: Weights):
        if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"):
            self.bits = weights.get_tensor("gptq_bits").item()
            self.groupsize = weights.get_tensor("gptq_groupsize").item()
            self.desc_act = False
            # `server quantize` used asymmetric quantization unconditionally
            # before the `gptq_sym` setting tensor was added.
            self.sym = (
                weights.get_tensor("gptq_sym").item()
                if weights.has_tensor("gptq_sym")
                else False
            )
            self.quant_method = "gptq"


# Needs to be at the end because circular import.
try:
    major, _minor = torch.cuda.get_device_capability()
except Exception:
    major = 1

HAS_EXLLAMA = False
CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm"
V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
if os.getenv("DISABLE_EXLLAMA") == "True":
    HAS_EXLLAMA = False
elif CAN_EXLLAMA:
    try:
        if V2:
            from text_generation_server.layers.gptq.exllamav2 import (
                QuantLinear as ExllamaQuantLinear,  # noqa: F401
                create_exllama_buffers,  # noqa: F401
                set_device,  # noqa: F401
            )

            HAS_EXLLAMA = "2"
        else:
            from text_generation_server.layers.gptq.exllama import (
                Ex4bitLinear as ExllamaQuantLinear,  # noqa: F401
                create_exllama_buffers,  # noqa: F401
                set_device,  # noqa: F401
            )

            HAS_EXLLAMA = "1"

    except ImportError:
        pass


================================================
FILE: server/text_generation_server/layers/gptq/custom_autotune.py
================================================
# https://github.com/fpgaminer/GPTQ-triton
"""
Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100.
"""

import builtins
import math
import time
from typing import Dict

import triton


class Autotuner(triton.KernelInterface):
    def __init__(
        self,
        fn,
        arg_names,
        configs,
        key,
        reset_to_zero,
        prune_configs_by: Dict = None,
        nearest_power_of_two: bool = False,
    ):
        """
        :param prune_configs_by: a dict of functions that are used to prune configs, fields:
                'perf_model': performance model used to predicate running time with different configs, returns running time
                'top_k': number of configs to bench
                'prune_num_stages_by'(optional): a function used to prune num_stages. It take configs:List[Config] as its input, and returns pruned configs.
                'nearest_power_of_two'(optional): whether to round key arguments to the nearest power of two when caching tuning results
        """
        if not configs:
            self.configs = [triton.Config({}, num_warps=4, num_stages=2)]
        else:
            self.configs = configs
        self.key_idx = [arg_names.index(k) for k in key]
        self.nearest_power_of_two = nearest_power_of_two
        self.cache = {}
        # hook to reset all required tensor to zeros before relaunching a kernel
        self.hook = lambda args: 0
        if reset_to_zero is not None:
            self.reset_idx = [arg_names.index(k) for k in reset_to_zero]

            def _hook(args):
                for i in self.reset_idx:
                    args[i].zero_()

            self.hook = _hook
        self.arg_names = arg_names
        # prune configs
        if prune_configs_by:
            perf_model, top_k = (
                prune_configs_by["perf_model"],
                prune_configs_by["top_k"],
            )
            if "early_config_prune" in prune_configs_by:
                early_config_prune = prune_configs_by["early_config_prune"]
        else:
            perf_model, top_k, early_config_prune = None, None, None
        self.perf_model, self.configs_top_k = perf_model, top_k
        self.early_config_prune = early_config_prune
        self.fn = fn

    def _bench(self, *args, config, **meta):
        # check for conflicts, i.e. meta-parameters both provided
        # as kwargs and by the autotuner
        conflicts = meta.keys() & config.kwargs.keys()
        if conflicts:
            raise ValueError(
                f"Conflicting meta-parameters: {', '.join(conflicts)}."
                " Make sure that you don't re-define auto-tuned symbols."
            )
        # augment meta-parameters with tunable ones
        current = dict(meta, **config.kwargs)

        def kernel_call():
            if config.pre_hook:
                config.pre_hook(self.nargs)
            self.hook(args)
            self.fn.run(
                *args,
                num_warps=config.num_warps,
                num_stages=config.num_stages,
                **current,
            )

        try:
            # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
            # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
            return triton.testing.do_bench(
                kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40
            )
        except triton.OutOfResources:
            return [float("inf"), float("inf"), float("inf")]

    def run(self, *args, **kwargs):
        self.nargs = dict(zip(self.arg_names, args))
        if len(self.configs) > 1:
            key = tuple(args[i] for i in self.key_idx)

            # This reduces the amount of autotuning by rounding the keys to the nearest power of two
            # In my testing this gives decent results, and greatly reduces the amount of tuning required
            if self.nearest_power_of_two:
                key = tuple([2 ** int(math.log2(x) + 0.5) for x in key])

            if key not in self.cache:
                # prune configs
                pruned_configs = self.prune_configs(kwargs)
                bench_start = time.time()
                timings = {
                    config: self._bench(*args, config=config, **kwargs)
                    for config in pruned_configs
                }
                bench_end = time.time()
                self.bench_time = bench_end - bench_start
                self.cache[key] = builtins.min(timings, key=timings.get)
                self.hook(args)
                self.configs_timings = timings
            config = self.cache[key]
        else:
            config = self.configs[0]
        self.best_config = config
        if config.pre_hook is not None:
            config.pre_hook(self.nargs)
        return self.fn.run(
            *args,
            num_warps=config.num_warps,
            num_stages=config.num_stages,
            **kwargs,
            **config.kwargs,
        )

    def prune_configs(self, kwargs):
        pruned_configs = self.configs
        if self.early_config_prune:
            pruned_configs = self.early_config_prune(self.configs, self.nargs)
        if self.perf_model:
            top_k = self.configs_top_k
            if isinstance(top_k, float) and top_k <= 1.0:
                top_k = int(len(self.configs) * top_k)
            if len(pruned_configs) > top_k:
                est_timing = {
                    config: self.perf_model(
                        **self.nargs,
                        **kwargs,
                        **config.kwargs,
                        num_stages=config.num_stages,
                        num_warps=config.num_warps,
                    )
                    for config in pruned_configs
                }
                pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[
                    :top_k
                ]
        return pruned_configs

    def warmup(self, *args, **kwargs):
        self.nargs = dict(zip(self.arg_names, args))
        for config in self.prune_configs(kwargs):
            self.fn.warmup(
                *args,
                num_warps=config.num_warps,
                num_stages=config.num_stages,
                **kwargs,
                **config.kwargs,
            )
        self.nargs = None


def autotune(
    configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False
):
    """
    Decorator for auto-tuning a :code:`triton.jit`'d function.
    .. highlight:: python
    .. code-block:: python
            @triton.autotune(configs=[
                    triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4),
                    triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8),
                    ],
                    key=['x_size'] # the two above configs will be evaluated anytime
                                                    # the value of x_size changes
            )
            @triton.jit
            def kernel(x_ptr, x_size, **META):
                    BLOCK_SIZE = META['BLOCK_SIZE']
    :note: When all the configurations are evaluated, the kernel will run multiple time.
                    This means that whatever value the kernel updates will be updated multiple times.
                    To avoid this undesired behavior, you can use the `reset_to_zero` argument, which
                    reset the value of the provided tensor to `zero` before running any configuration.
    :param configs: a list of :code:`triton.Config` objects
    :type configs: list[triton.Config]
    :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
    :type key: list[str]
    :param prune_configs_by: a dict of functions that are used to prune configs, fields:
            'perf_model': performance model used to predicate running time with different configs, returns running time
            'top_k': number of configs to bench
            'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It take configs:List[Config] as its input, and returns pruned configs.
    :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
    :type reset_to_zero: list[str]
    """

    def decorator(fn):
        return Autotuner(
            fn,
            fn.arg_names,
            configs,
            key,
            reset_to_zero,
            prune_configs_by,
            nearest_power_of_two,
        )

    return decorator


def matmul248_kernel_config_pruner(configs, nargs):
    """
    The main purpose of this function is to shrink BLOCK_SIZE_* when the corresponding dimension is smaller.
    """
    m = max(2 ** int(math.ceil(math.log2(nargs["M"]))), 16)
    n = max(2 ** int(math.ceil(math.log2(nargs["N"]))), 16)
    k = max(2 ** int(math.ceil(math.log2(nargs["K"]))), 16)

    used = set()
    for config in configs:
        block_size_m = min(m, config.kwargs["BLOCK_SIZE_M"])
        block_size_n = min(n, config.kwargs["BLOCK_SIZE_N"])
        block_size_k = min(k, config.kwargs["BLOCK_SIZE_K"])
        group_size_m = config.kwargs["GROUP_SIZE_M"]

        if (
            block_size_m,
            block_size_n,
            block_size_k,
            group_size_m,
            config.num_stages,
            config.num_warps,
        ) in used:
            continue

        used.add(
            (
                block_size_m,
                block_size_n,
                block_size_k,
                group_size_m,
                config.num_stages,
                config.num_warps,
            )
        )
        yield triton.Config(
            {
                "BLOCK_SIZE_M": block_size_m,
                "BLOCK_SIZE_N": block_size_n,
                "BLOCK_SIZE_K": block_size_k,
                "GROUP_SIZE_M": group_size_m,
            },
            num_stages=config.num_stages,
            num_warps=config.num_warps,
        )


================================================
FILE: server/text_generation_server/layers/gptq/exllama.py
================================================
from text_generation_server.layers.gptq import GPTQWeight
import torch
from exllama_kernels import make_q4, q4_matmul, prepare_buffers, set_tuning_params

# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta")


def ext_make_q4(qweight, qzeros, scales, g_idx, device):
    """Construct Q4Matrix, return handle"""
    return make_q4(
        qweight, qzeros, scales, g_idx if g_idx is not None else none_tensor, device
    )


def ext_q4_matmul(x, q4, q4_width):
    """Matrix multiplication, returns x @ q4"""
    outshape = x.shape[:-1] + (q4_width,)
    x = x.view(-1, x.shape[-1])
    output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device)

    q4_matmul(x, q4, output)

    return output.view(outshape)


MAX_DQ = 1
MAX_INNER = 1
ACT_ORDER = False
DEVICE = None

TEMP_STATE = None
TEMP_DQ = None


def set_device(device):
    global DEVICE
    DEVICE = device


def create_exllama_buffers(max_total_tokens: int):
    global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE, TEMP_STATE, TEMP_DQ

    assert DEVICE is not None, "call set_device first"

    if not ACT_ORDER:
        max_total_tokens = 1

    # This temp_state buffer is required to reorder X in the act-order case.
    temp_state = torch.zeros(
        (max_total_tokens, MAX_INNER), dtype=torch.float16, device=DEVICE
    )
    temp_dq = torch.zeros((1, MAX_DQ), dtype=torch.float16, device=DEVICE)

    # This temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
    prepare_buffers(DEVICE, temp_state, temp_dq)

    matmul_recons_thd = 8
    matmul_fused_remap = False
    matmul_no_half2 = False
    set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)

    TEMP_STATE, TEMP_DQ = temp_state, temp_dq


class Ex4bitLinear(torch.nn.Module):
    """Linear layer implementation with per-group 4-bit quantization of the weights"""

    def __init__(self, weight: GPTQWeight, bias):
        super().__init__()
        global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE
        assert weight.bits == 4

        self.device = weight.qweight.device
        self.qweight = weight.qweight
        self.qzeros = weight.qzeros
        self.scales = weight.scales
        self.g_idx = weight.g_idx.cpu() if weight.g_idx is not None else None
        self.bias = bias if bias is not None else None

        if self.g_idx is not None and (
            (self.g_idx == 0).all()
            or torch.equal(
                weight.g_idx.cpu(),
                torch.tensor(
                    [i // weight.groupsize for i in range(weight.g_idx.shape[0])],
                    dtype=torch.int32,
                ),
            )
        ):
            self.empty_g_idx = True
            self.g_idx = None

        assert self.device.type == "cuda"
        assert self.device.index is not None

        self.q4 = ext_make_q4(
            self.qweight, self.qzeros, self.scales, self.g_idx, self.device.index
        )

        self.height = weight.qweight.shape[0] * 8
        self.width = weight.qweight.shape[1]

        # Infer groupsize from height of qzeros
        self.groupsize = None
        if self.qzeros.shape[0] > 1:
            self.groupsize = (self.qweight.shape[0] * 8) // (self.qzeros.shape[0])

        if self.groupsize is not None:
            assert weight.groupsize == self.groupsize

        # Handle act-order matrix
        if self.g_idx is not None:
            if self.groupsize is None:
                raise ValueError("Found group index but no groupsize. What do?")
            self.act_order = True
        else:
            self.act_order = False

        DEVICE = self.qweight.device

        MAX_DQ = max(MAX_DQ, self.qweight.numel() * 8)

        if self.act_order:
            MAX_INNER = max(MAX_INNER, self.height, self.width)

            ACT_ORDER = True

    def forward(self, x):
        out = ext_q4_matmul(x, self.q4, self.width)

        if self.bias is not None:
            out.add_(self.bias)
        return out


================================================
FILE: server/text_generation_server/layers/gptq/exllamav2.py
================================================
# Adapted from turboderp exllama: https://github.com/turboderp/exllamav2

from dataclasses import dataclass
from typing import Optional
import torch
import torch.nn as nn

from loguru import logger

from text_generation_server.layers.exl2 import Exl2Weight
from text_generation_server.layers.gptq import GPTQWeight
from text_generation_server.utils.log import log_master

try:
    from exllamav2.ext import exllamav2_ext

    make_q_matrix = exllamav2_ext.make_q_matrix
    gemm_half_q_half = exllamav2_ext.gemm_half_q_half
except ImportError:
    log_master(logger.warning, "exllamav2_kernels not installed.")
    raise

# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta")


@dataclass
class _ExtraTensors:
    """Additional generated quantizer tensors."""

    q_group_map: Optional[torch.Tensor] = None
    q_invperm: Optional[torch.Tensor] = None
    q_perm: Optional[torch.Tensor] = None


def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
    """Matrix multiplication, returns x @ q4"""
    output_shape = x.shape[:-1] + (q4_width,)
    x = x.view(-1, x.shape[-1])
    output = torch.empty((x.shape[0], q4_width), dtype=torch.half, device=x.device)
    gemm_half_q_half(x, q_handle, output, force_cuda)
    return output.view(output_shape)


def make_group_map(q_groups: torch.Tensor, num_qrows: int):
    gr = q_groups.tolist()
    group_map = []
    num_groups = len(gr) // 2

    for i in range(num_groups):
        bits = gr[i * 2]
        if i < num_groups - 1:
            qrows = gr[i * 2 + 3] - gr[i * 2 + 1]
        else:
            qrows = num_qrows - gr[i * 2 + 1]
        rows = qrows * 32 // bits
        for j in range(rows):
            group_map += [i]
            group_map += [rows - j]

    return torch.tensor(group_map, dtype=torch.short, device=q_groups.device)


# Create Q matrix


def ext_make_q_matrix(
    w: Exl2Weight | GPTQWeight,
    extra: _ExtraTensors,
    temp_dq,
    key: Optional[str] = None,
):
    """
    Create Q matrix
    """
    # max_dq_size = 512*(1024**2)
    # max_dq_rows = max_dq_size // out_features[0]
    max_dq_rows = 0

    # EXL2
    if isinstance(w, Exl2Weight):
        extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0])
        extra.q_perm = torch.argsort(w.q_invperm).short()

        return make_q_matrix(
            w.q_weight,
            extra.q_perm,
            w.q_invperm,
            w.q_scale,
            w.q_scale_max,
            w.q_groups,
            extra.q_group_map,
            none_tensor,  # zeros
            none_tensor,  # scales
            none_tensor,  # g_idx
            none_tensor,  # bias
            temp_dq,
            max_dq_rows,
        )
    # GPTQ
    elif isinstance(w, GPTQWeight):
        if w.scales.dtype == torch.float:
            w.scales = w.scales.half()

        # GPTQ with g_idx (act_order)
        if w.g_idx is not None and not (w.g_idx == 0).all().item():
            extra.q_perm = torch.empty(
                (w.qweight.shape[0] * 8,),
                dtype=torch.short,
                device=w.qweight.device,
            )
            extra.q_invperm = torch.empty_like(extra.q_perm)
            # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
            return make_q_matrix(
                w.qweight,
                extra.q_perm,
                extra.q_invperm,
                none_tensor,  # q_scale
                none_tensor,  # q_scale_max
                none_tensor,  # q_groups
                none_tensor,  # q_group_map
                w.qzeros,
                w.scales,
                w.g_idx.cpu(),
                none_tensor,  # bias
                temp_dq,
                max_dq_rows,
            )
        # GPTQ without g_idx
        else:
            return make_q_matrix(
                w.qweight,
                none_tensor,  # q_perm
                none_tensor,  # q_invperm
                none_tensor,  # q_scale
                none_tensor,  # q_scale_max
                none_tensor,  # q_groups
                none_tensor,  # q_group_map
                w.qzeros,
                w.scales,
                none_tensor,  # g_idx
                none_tensor,  # bias
                temp_dq,
                max_dq_rows,
            )
    else:
        RuntimeError("Cannot create handle")


DEVICE = None
LAYERS = []


def set_device(device):
    global DEVICE
    DEVICE = device


def create_exllama_buffers(max_total_tokens: int):
    global LAYERS, DEVICE

    # No need to initialize scratch space if there are no layers
    # that use ExLLamav2.
    if len(LAYERS) == 0:
        return

    # Find the size of the scratch space.
    scratch_bytes = max(
        layer.scratch_space_fixed(max_input_len=max_total_tokens, max_batch_size=1)
        for layer in LAYERS
    )
    temp_dq = ExLlamaV2DeviceTensors(DEVICE, scratch_bytes)

    for layer in LAYERS:
        layer.post_init(temp_dq)


class QuantLinear(nn.Module):
    QUANT_TYPE = "exllamav2"

    """Linear layer implementation with per-group 4-bit quantization of the weights"""

    def __init__(
        self,
        weight: Exl2Weight | GPTQWeight,
        bias: torch.Tensor,
    ):
        super().__init__()

        self.q_handle = None
        self.q_tensors = weight
        self.extra_tensors = _ExtraTensors()

        if isinstance(weight, Exl2Weight):
            self.infeatures = weight.q_invperm.shape[0]
            self.outfeatures = weight.q_weight.shape[1]
        elif isinstance(weight, GPTQWeight):
            if weight.bits != 4:
                raise ValueError(
                    f"Exllamav2 kernel supports only bits=4, requested bits={weight.bits}. Something is wrong in the model initialization."
                )

            self.infeatures = weight.qweight.shape[0] // weight.bits * 32
            self.outfeatures = weight.qweight.shape[1]

        self.padding = -self.outfeatures % 32
        self.outfeatures = self.outfeatures + self.padding

        self.device = weight.device
        self.bias = bias if bias is not None else None

        global LAYERS
        LAYERS.append(self)

    def post_init(self, temp_dq):
        device = self.q_tensors.device
        assert device.type == "cuda"
        assert device.index is not None
        temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())

        # We NEED to keep a pointer on Python side, otherwise the garbage collector will mess with us,
        # and `Memory access fault by GPU node-2` will EAT you.
        self.temp_dq = temp_dq
        self.q_handle = ext_make_q_matrix(self.q_tensors, self.extra_tensors, temp_dq)

    def forward(self, x, force_cuda=False):
        output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)

        if self.bias is not None:
            output.add_(self.bias)
        return output

    def temp_dq_size(self):
        return self.infeatures * self.outfeatures * 2 + 128

    def temp_fwd_size(self, max_input_len, max_batch_size):
        return self.outfeatures * max_input_len * max_batch_size * 4 + 128

    def scratch_space_fixed(self, max_input_len, max_batch_size):
        return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size)


class ExLlamaV2DeviceTensors:

    device_idx: int
    scratch_bytes: int
    scratch_idx: int
    scratch: torch.tensor = None

    def __init__(self, device, scratch_bytes):
        self.device = device
        self.scratch_bytes = scratch_bytes

    def prepare(self):
        self.scratch = torch.empty(
            (self.scratch_bytes // 2,), dtype=torch.half, device=self.device
        )

    def get_scratch_slice(self, size_bytes):

        if self.scratch is None:
            self.prepare()

        size_bytes = ((size_bytes + 127) // 128) * 128
        size_half = size_bytes // 2
        scratch_slice = self.scratch.narrow(0, 0, size_half)
        return scratch_slice


================================================
FILE: server/text_generation_server/layers/gptq/ipex.py
================================================
import math
import numpy as np
import torch
import torch.nn as nn

import intel_extension_for_pytorch as ipex


class QuantLinear(nn.Module):
    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
        super().__init__()
        self.register_buffer("qweight", qweight)
        self.register_buffer("qzeros", qzeros)
        self.register_buffer("scales", scales)
        self.register_buffer("g_idx", g_idx)
        if bias is not None:
            self.register_buffer("bias", bias)
        else:
            self.bias = None
        if bits not in [4]:
            raise NotImplementedError("Only 4 bits are supported.")
        self.bits = bits
        self.maxq = 2**self.bits - 1
        self.groupsize = groupsize

        self.outfeatures = qweight.shape[1]
        self.infeatures = qweight.shape[0] * 32 // bits
        self.woq_linear = (
            ipex.llm.quantization.IPEXWeightOnlyQuantizedLinear.from_weight(
                self.qweight,
                self.scales,
                self.qzeros,
                self.infeatures,
                self.outfeatures,
                bias=self.bias,
                group_size=self.groupsize,
                g_idx=g_idx,
                quant_method=ipex.llm.quantization.QuantMethod.GPTQ_GEMM,
                dtype=ipex.llm.quantization.QuantDtype.INT4,
            )
        )

    @classmethod
    def new(cls, bits, groupsize, infeatures, outfeatures, bias):
        if bits not in [4]:
            raise NotImplementedError("Only 4 bits are supported.")

        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
        qzeros = torch.zeros(
            (math.ceil(infeatures / groupsize), outfeatures // 32 * bits),
            dtype=torch.int32,
        )
        scales = torch.zeros(
            (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
        )
        g_idx = torch.tensor(
            [i // groupsize for i in range(infeatures)], dtype=torch.int32
        )
        if bias:
            bias = torch.zeros((outfeatures), dtype=torch.float16)
        else:
            bias = None
        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)

    def pack(self, linear, scales, zeros, g_idx=None):
        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx

        scales = scales.t().contiguous()
        zeros = zeros.t().contiguous()
        scale_zeros = zeros * scales
        self.scales = scales.clone().half()
        if linear.bias is not None:
            self.bias = linear.bias.clone().half()

        intweight = []
        for idx in range(self.infeatures):
            intweight.append(
                torch.round(
                    (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
                    / self.scales[self.g_idx[idx]]
                ).to(torch.int)[:, None]
            )
        intweight = torch.cat(intweight, dim=1)
        intweight = intweight.t().contiguous()
        intweight = intweight.numpy().astype(np.uint32)
        qweight = np.zeros(
            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
        )
        i = 0
        row = 0
        while row < qweight.shape[0]:
            if self.bits in [4]:
                for j in range(i, i + (32 // self.bits)):
                    qweight[row] |= intweight[j] << (self.bits * (j - i))
                i += 32 // self.bits
                row += 1
            else:
                raise NotImplementedError("Only 4 bits are supported.")

        qweight = qweight.astype(np.int32)
        self.qweight = torch.from_numpy(qweight)

        zeros -= 1
        zeros = zeros.numpy().astype(np.uint32)
        qzeros = np.zeros(
            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
        )
        i = 0
        col = 0
        while col < qzeros.shape[1]:
            if self.bits in [4]:
                for j in range(i, i + (32 // self.bits)):
                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
                i += 32 // self.bits
                col += 1
            else:
                raise NotImplementedError("Only 4 bits are supported.")

        qzeros = qzeros.astype(np.int32)
        self.qzeros = torch.from_numpy(qzeros)

    def forward(self, x):
        out_shape = x.shape[:-1] + (self.outfeatures,)
        out = self.woq_linear(x.reshape(-1, x.shape[-1]))
        return out.reshape(out_shape)


================================================
FILE: server/text_generation_server/layers/gptq/quantize.py
================================================
import time
import torch.nn as nn
import math
import json
import os
import torch
import transformers

from texttable import Texttable
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
from huggingface_hub import HfApi
from accelerate import init_empty_weights
from text_generation_server.utils import initialize_torch_distributed, Weights
from text_generation_server.utils.hub import weight_files
from text_generation_server.layers.gptq import QuantLinear
from loguru import logger
from typing import Optional
from text_generation_server.layers.gptq.utils import torch_snr_error

from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight

DEV = torch.device("cuda:0")


class Quantizer(nn.Module):
    def __init__(self, shape=1):
        super(Quantizer, self).__init__()
        self.register_buffer("maxq", torch.tensor(0))
        self.register_buffer("scale", torch.zeros(shape))
        self.register_buffer("zero", torch.zeros(shape))

    def configure(
        self,
        bits,
        perchannel=False,
        sym=True,
        mse=False,
        norm=2.4,
        grid=100,
        maxshrink=0.8,
        trits=False,
    ):
        self.maxq = torch.tensor(2**bits - 1)
        self.perchannel = perchannel
        self.sym = sym
        self.mse = mse
        self.norm = norm
        self.grid = grid
        self.maxshrink = maxshrink
        if trits:
            self.maxq = torch.tensor(-1)
        self.scale = torch.zeros_like(self.scale)

    def _quantize(self, x, scale, zero, maxq):
        if maxq < 0:
            return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
        q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
        return scale * (q - zero)

    def find_params(self, x, weight=False):
        dev = x.device
        self.maxq = self.maxq.to(dev)

        shape = x.shape
        if self.perchannel:
            if weight:
                x = x.flatten(1)
            else:
                if len(shape) == 4:
                    x = x.permute([1, 0, 2, 3])
                    x = x.flatten(1)
                if len(shape) == 3:
                    x = x.reshape((-1, shape[-1])).t()
                if len(shape) == 2:
                    x = x.t()
        else:
            x = x.flatten().unsqueeze(0)

        tmp = torch.zeros(x.shape[0], device=dev)
        xmin = torch.minimum(x.min(1)[0], tmp)
        xmax = torch.maximum(x.max(1)[0], tmp)

        if self.sym:
            xmax = torch.maximum(torch.abs(xmin), xmax)
            tmp = xmin < 0
            if torch.any(tmp):
                xmin[tmp] = -xmax[tmp]
        tmp = (xmin == 0) & (xmax == 0)
        xmin[tmp] = -1
        xmax[tmp] = +1

        if self.maxq < 0:
            self.scale = xmax
            self.zero = xmin
        else:
            self.scale = (xmax - xmin) / self.maxq
            if self.sym:
                self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2)
            else:
                self.zero = torch.round(-xmin / self.scale)

        if self.mse:
            best = torch.full([x.shape[0]], float("inf"), device=dev)
            for i in range(int(self.maxshrink * self.grid)):
                p = 1 - i / self.grid
                xmin1 = p * xmin
                xmax1 = p * xmax
                scale1 = (xmax1 - xmin1) / self.maxq
                zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero
                q = self._quantize(
                    x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq
                )
                q -= x
                q.abs_()
                q.pow_(self.norm)
                err = torch.sum(q, 1)
                tmp = err < best
                if torch.any(tmp):
                    best[tmp] = err[tmp]
                    self.scale[tmp] = scale1[tmp]
                    self.zero[tmp] = zero1[tmp]
        if not self.perchannel:
            if weight:
                tmp = shape[0]
            else:
                tmp = shape[1] if len(shape) != 3 else shape[2]
            self.scale = self.scale.repeat(tmp)
            self.zero = self.zero.repeat(tmp)

        if weight:
            shape = [-1] + [1] * (len(shape) - 1)
            self.scale = self.scale.reshape(shape)
            self.zero = self.zero.reshape(shape)
            return
        if len(shape) == 4:
            self.scale = self.scale.reshape((1, -1, 1, 1))
            self.zero = self.zero.reshape((1, -1, 1, 1))
        if len(shape) == 3:
            self.scale = self.scale.reshape((1, 1, -1))
            self.zero = self.zero.reshape((1, 1, -1))
        if len(shape) == 2:
            self.scale = self.scale.unsqueeze(0)
            self.zero = self.zero.unsqueeze(0)

    def quantize(self, x):
        if self.ready():
            return self._quantize(x, self.scale, self.zero, self.maxq)

        return x

    def enabled(self):
        return self.maxq > 0

    def ready(self):
        return torch.all(self.scale != 0)


class GPTQ:
    def __init__(self, layer, observe=False):
        self.layer = layer
        self.dev = self.layer.weight.device
        W = layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        self.rows = W.shape[0]
        self.columns = W.shape[1]
        self.H = torch.zeros((self.columns, self.columns), device=self.dev)
        self.nsamples = 0
        self.quantizer = Quantizer()
        self.observe = observe

    def add_batch(self, inp, out):
        # Hessian H = 2 X XT + λ I
        if self.observe:
            self.inp1 = inp
            self.out1 = out
        else:
            self.inp1 = None
            self.out1 = None

        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
        tmp = inp.shape[0]
        if isinstance(self.layer, nn.Linear) or isinstance(
            self.layer, transformers.Conv1D
        ):
            if len(inp.shape) == 3:
                inp = inp.reshape((-1, inp.shape[-1]))
            inp = inp.t()
        if isinstance(self.layer, nn.Conv2d):
            unfold = nn.Unfold(
                self.layer.kernel_size,
                dilation=self.layer.dilation,
                padding=self.layer.padding,
                stride=self.layer.stride,
            )
            inp = unfold(inp)
            inp = inp.permute([1, 0, 2])
            inp = inp.flatten(1)
        self.H *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        # inp = inp.float()
        inp = math.sqrt(2 / self.nsamples) * inp.float()
        # self.H += 2 / self.nsamples * inp.matmul(inp.t())
        self.H += inp.matmul(inp.t())

    def print_loss(self, name, q_weight, weight_error, timecost):
        table = Texttable()
        length = 28
        name = (
            (name + " " * (length - len(name)))
            if len(name) <= length
            else name[:length]
        )

        table.header(["name", "weight_error", "fp_inp_SNR", "q_inp_SNR", "time"])

        # assign weight
        self.layer.weight.data = q_weight.reshape(self.layer.weight.shape).to(
            self.layer.weight.data.dtype
        )

        if self.inp1 is not None:
            # quantize input to int8
            quantizer = Quantizer()
            quantizer.configure(8, perchannel=False, sym=True, mse=False)
            quantizer.find_params(self.inp1)
            q_in = quantizer.quantize(self.inp1).type(torch.float16)
            q_out = self.layer(q_in)

            # get kinds of SNR
            q_SNR = torch_snr_error(q_out, self.out1).item()
            fp_SNR = torch_snr_error(self.layer(self.inp1), self.out1).item()
        else:
            q_SNR = "-"
            fp_SNR = "-"

        table.add_row([name, weight_error, fp_SNR, q_SNR, timecost])
        print(table.draw().split("\n")[-2])

    def fasterquant(
        self, blocksize=128, percdamp=0.01, groupsize=-1, act_order=False, name=""
    ):
        self.layer.to(self.dev)

        W = self.layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        W = W.float()

        tick = time.time()

        if not self.quantizer.ready():
            self.quantizer.find_params(W, weight=True)

        H = self.H
        if not self.observe:
            del self.H
        dead = torch.diag(H) == 0
        H[dead, dead] = 1
        W[:, dead] = 0

        if act_order:
            perm = torch.argsort(torch.diag(H), descending=True)
            W = W[:, perm]
            H = H[perm][:, perm]

        Losses = torch.zeros_like(W)
        Q = torch.zeros_like(W)

        damp = percdamp * torch.mean(torch.diag(H))
        diag = torch.arange(self.columns, device=self.dev)
        H[diag, diag] += damp
        H = torch.linalg.cholesky(H)
        H = torch.cholesky_inverse(H)
        try:
            H = torch.linalg.cholesky(H, upper=True)
        except Exception:
            # Addition because Falcon fails on h_to_4h
            H = torch.linalg.cholesky(
                H + 1e-5 * torch.eye(H.shape[0]).to(H.device), upper=True
            )
        Hinv = H

        g_idx = []
        scale = []
        zero = []
        now_idx = 1

        for i1 in range(0, self.columns, blocksize):
            i2 = min(i1 + blocksize, self.columns)
            count = i2 - i1

            W1 = W[:, i1:i2].clone()
            Q1 = torch.zeros_like(W1)
            Err1 = torch.zeros_like(W1)
            Losses1 = torch.zeros_like(W1)
            Hinv1 = Hinv[i1:i2, i1:i2]

            for i in range(count):
                w = W1[:, i]
                d = Hinv1[i, i]

                if groupsize != -1:
                    if (i1 + i) % groupsize == 0:
                        self.quantizer.find_params(
                            W[:, (i1 + i) : (i1 + i + groupsize)], weight=True
                        )

                    if ((i1 + i) // groupsize) - now_idx == -1:
                        scale.append(self.quantizer.scale)
                        zero.append(self.quantizer.zero)
                        now_idx += 1

                q = self.quantizer.quantize(w.unsqueeze(1)).flatten()
                Q1[:, i] = q
                Losses1[:, i] = (w - q) ** 2 / d**2

                err1 = (w - q) / d
                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
                Err1[:, i] = err1

            Q[:, i1:i2] = Q1
            Losses[:, i1:i2] = Losses1 / 2

            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])

        torch.cuda.synchronize()
        error = torch.sum(Losses).item()

        groupsize = groupsize if groupsize != -1 else self.columns
        g_idx = [i // groupsize for i in range(self.columns)]
        g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device)
        if act_order:
            invperm = torch.argsort(perm)
            Q = Q[:, invperm]
            g_idx = g_idx[invperm]

        if isinstance(self.layer, transformers.Conv1D):
            Q = Q.t()

        self.print_loss(
            name=name, q_weight=Q, weight_error=error, timecost=(time.time() - tick)
        )

        if scale == []:
            scale.append(self.quantizer.scale)
            zero.append(self.quantizer.zero)
        scale = torch.cat(scale, dim=1)
        zero = torch.cat(zero, dim=1)
        return scale, zero, g_idx, error

    def free(self):
        self.inp1 = None
        self.out1 = None
        self.H = None
        self.Losses = None
        self.Trace = None
        torch.cuda.empty_cache()


def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
    testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset("ptb_text_only", "penn_treebank", split="train")
    valdata = load_dataset("ptb_text_only", "penn_treebank", split="validation")

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    trainenc = tokenizer("\n\n".join(traindata["sentence"]), return_tensors="pt")
    testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt")

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset(
        "allenai/c4",
        "allenai--c4",
        data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
        split="train",
        use_auth_token=False,
    )
    valdata = load_dataset(
        "allenai/c4",
        "allenai--c4",
        data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
        split="validation",
        use_auth_token=False,
    )

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]["text"], return_tensors="pt")
            if trainenc.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    import random

    random.seed(0)
    valenc = []
    for _ in range(256):
        while True:
            i = random.randint(0, len(valdata) - 1)
            tmp = tokenizer(valdata[i]["text"], return_tensors="pt")
            if tmp.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        valenc.append(tmp.input_ids[:, i:j])
    valenc = torch.hstack(valenc)

    class TokenizerWrapper:
        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset("ptb_text_only", "penn_treebank", split="train")
    testdata = load_dataset("ptb_text_only", "penn_treebank", split="test")

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    trainenc = tokenizer(" ".join(traindata["sentence"]), return_tensors="pt")
    testenc = tokenizer(" ".join(testdata["sentence"]), return_tensors="pt")

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc


def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code):
    from datasets import load_dataset

    traindata = load_dataset(
        "allenai/c4",
        "allenai--c4",
        data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
        split="train",
    )
    valdata = load_dataset(
        "allenai/c4",
        "allenai--c4",
        data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
        split="validation",
    )

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )

    import random

    random.seed(seed)
    trainloader = []
    for _ in range(nsamples):
        while True:
            i = random.randint(0, len(traindata) - 1)
            trainenc = tokenizer(traindata[i]["text"], return_tensors="pt")
            if trainenc.input_ids.shape[1] >= seqlen:
                break
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))

    valenc = tokenizer(" ".join(valdata[:1100]["text"]), return_tensors="pt")
    valenc = valenc.input_ids[:, : (256 * seqlen)]

    class TokenizerWrapper:
        def __init__(self, input_ids):
            self.input_ids = input_ids

    valenc = TokenizerWrapper(valenc)

    return trainloader, valenc


def get_loaders(
    name, nsamples=128, seed=0, seqlen=2048, model_id="", trust_remote_code=False
):
    if "wikitext2" in name:
        return get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code)
    if "ptb" in name:
        if "new" in name:
            return get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code)
        return get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code)
    if "c4" in name:
        if "new" in name:
            return get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code)
        return get_c4(nsamples, seed, seqlen, model_id, trust_remote_code)


def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
    # Skip last lm_head linear
    # Need isintance Falcon is inheriting Linear.
    if isinstance(module, layers) and "lm_head" not in name:
        return {name: module}
    res = {}
    for name1, child in module.named_children():
        res.update(
            find_layers(
                child, layers=layers, name=name + "." + name1 if name != "" else name1
            )
        )
    return res


@torch.no_grad()
def sequential(
    model,
    dataloader,
    dev,
    nsamples,
    bits,
    groupsize,
    *,
    hooks,
    percdamp=0.01,
    sym: bool = False,
    act_order: bool = False,
):
    print("Starting ...")

    use_cache = model.config.use_cache
    model.config.use_cache = False
    try:
        layers = model.model.layers
        prefix = "model.layers"
    except Exception:
        layers = model.transformer.h
        prefix = "transformer.h"

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros(
        (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev
    )

    cache = {"i": 0}
    extra = {}

    class Catcher(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache["i"]] = inp
            cache["i"] += 1
            extra.update(kwargs.copy())
            raise ValueError

    layers[0] = Catcher(layers[0])
    for batch in dataloader:
        try:
            model(batch[0].cuda())
        except ValueError:
            pass
    layers[0] = layers[0].module

    # layers[0] = layers[0].cpu()
    # model.model.embed_tokens = model.model.embed_tokens.cpu()
    # model.model.norm = model.model.norm.cpu()
    torch.cuda.empty_cache()
    for hook in hooks:
        hook.remove()

    outs = torch.zeros_like(inps)

    extra = {
        k: v.to(dev) if isinstance(v, torch.Tensor) else v for k, v in extra.items()
    }

    print("Ready.")

    quantizers = {}
    for i in range(len(layers)):
        print(f"Quantizing layer {i+1}/{len(layers)}..")
        print("+------------------+--------------+------------+-----------+-------+")
        print("|       name       | weight_error | fp_inp_SNR | q_inp_SNR | time  |")
        print("+==================+==============+============+===========+=======+")

        layer = layers[i]
        layer.load()
        full = find_layers(layer)
        sequential = [list(full.keys())]

        for names in sequential:
            subset = {n: full[n] for n in names}
            gptq = {}
            for name in subset:
                gptq[name] = GPTQ(subset[name])
                gptq[name].quantizer.configure(
                    bits, perchannel=True, sym=sym, mse=False
                )
                pass

            def add_batch(name):
                nonlocal gptq

                def tmp(_, inp, out):
                    gptq[name].add_batch(inp[0].data, out.data)

                return tmp

            handles = []
            for name in subset:
                handles.append(subset[name].register_forward_hook(add_batch(name)))
            for j in range(nsamples):
                outs[j] = layer(inps[j].unsqueeze(0), **extra)[0]
            for h in handles:
                h.remove()

            for name in subset:
                scale, zero, g_idx, error = gptq[name].fasterquant(
                    percdamp=percdamp,
                    groupsize=groupsize,
                    act_order=act_order,
                    name=name,
                )
                quantizers[f"{prefix}.{i}.{name}"] = (
                    gptq[name].quantizer.cpu(),
                    scale.cpu(),
                    zero.cpu(),
                    g_idx.cpu(),
                    bits,
                    groupsize,
                )

                gptq[name].free()

        for j in range(nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), **extra)[0]

        layer.unload()
        del layer
        del gptq
        torch.cuda.empty_cache()

        inps, outs = outs, inps
        print("+------------------+--------------+------------+-----------+-------+")
        print("\n")

    model.config.use_cache = use_cache

    return quantizers


def make_quant_linear(module, names, bits, groupsize, name=""):
    if isinstance(module, QuantLinear):
        return
    for attr in dir(module):
        tmp = getattr(module, attr)
        name1 = name + "." + attr if name != "" else attr
        if name1 in names:
            delattr(module, attr)
            setattr(
                module,
                attr,
                QuantLinear.new(
                    bits,
                    groupsize,
                    tmp.in_features,
                    tmp.out_features,
                    tmp.bias is not None,
                ),
            )
    for name1, child in module.named_children():
        make_quant_linear(
            child, names, bits, groupsize, name + "." + name1 if name != "" else name1
        )


# TODO: perform packing on GPU
def pack(model, quantizers, bits, groupsize):
    layers = find_layers(model)
    layers = {n: layers[n] for n in quantizers}
    make_quant_linear(model, quantizers, bits, groupsize)
    qlayers = find_layers(model, (QuantLinear,))
    print("Packing ...")
    for name in qlayers:
        print(name)
        quantizers[name], scale, zero, g_idx, _, _ = quantizers[name]
        qlayers[name].pack(layers[name], scale, zero, g_idx)
    print("Done.")
    return model


def setdeepattr(module, full_name, tensor):
    current = module
    tokens = full_name.split(".")
    for token in tokens[:-1]:
        current = getattr(current, token)
    setattr(current, tokens[-1], tensor)


def getdeepattr(module, full_name):
    current = module
    tokens = full_name.split(".")
    for token in tokens:
        current = getattr(current, token)
    return current


def load_weights_pre_hook(module_name, weights, recursive=False):
    def inner(module, args):
        print(f"Pre hook {module_name}")
        local_params = {}
        for k, v in module.named_parameters():
            if not recursive and k.count(".") != 1:
                continue
            local_params[k] = v
        for k, v in module.named_buffers():
            if not recursive and k.count(".") != 1:
                continue
            local_params[k] = v

        for local_param in local_params:
            current_tensor = getdeepattr(module, local_param)
            if current_tensor.device == torch.device("meta"):
                # print(f"Loading {local_param}")
                if module_name:
                    tensor_name = f"{module_name}.{local_param}"
                else:
                    tensor_name = local_param
                tensor = weights.get_tensor(tensor_name)
                setdeepattr(module, local_param, nn.Parameter(tensor))
            else:
                tensor = current_tensor.to(device=torch.device("cuda:0"))
                if current_tensor.requires_grad:
                    tensor = nn.Parameter(tensor)
                setdeepattr(module, local_param, tensor)

    return inner


def load_weights_post_hook(module_name, weights, recursive=False):
    def inner(module, args, output):
        print(f"Post hook {module_name}")
        local_params = {}
        for k, v in module.named_parameters():
            if not recursive and k.count(".") != 1:
                continue
            local_params[k] = v
        for k, v in module.named_buffers():
            if not recursive and k.count(".") != 1:
                continue
            local_params[k] = v
        for local_param in local_params:
            # print(f"Unloading {local_param}")
            current_tensor = getdeepattr(module, local_param)
            setdeepattr(
                module,
                local_param,
                nn.Parameter(current_tensor.to(device=torch.device("cpu"))),
            )
        return output

    return inner


def quantize(
    model_id: str,
    bits: int,
    groupsize: int,
    output_dir: str,
    revision: str,
    trust_remote_code: bool,
    upload_to_model_id: Optional[str],
    percdamp: float,
    act_order: bool,
    sym: bool,
):
    print("loading model")
    config = AutoConfig.from_pretrained(
        model_id,
        trust_remote_code=trust_remote_code,
    )

    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(
            config, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
        )
    model = model.eval()

    print("LOADED model")
    files = weight_files(model_id, revision, extension=".safetensors")
    process_group, _, _ = initialize_torch_distributed()
    weights = Weights(
        files,
        device=torch.device("cuda:0"),
        dtype=torch.float16,
        process_group=process_group,
        aliases={"embed_tokens.weight": ["lm_head.weight"]},
        weights_loader=DefaultWeightsLoader(UnquantizedWeight),
    )
    hooks = []
    for name, module in model.named_modules():

        def load(module, name):
            def _load():
                load_weights_pre_hook(name, weights, recursive=True)(module, None)

            return _load

        def unload(module, name):
            def _unload():
                load_weights_post_hook(name, weights, recursive=True)(
                    module, None, None
                )

            return _unload

        module.load = load(module, name)
        module.unload = unload(module, name)
        hooks.append(
            module.register_forward_pre_hook(load_weights_pre_hook(name, weights))
        )
        hooks.append(
            module.register_forward_hook(load_weights_post_hook(name, weights))
        )
    model.seqlen = 2048

    dataset = "wikitext2"
    nsamples = 128
    seed = None

    dataloader, testloader = get_loaders(
        dataset,
        nsamples=nsamples,
        seed=seed,
        model_id=model_id,
        seqlen=model.seqlen,
        trust_remote_code=trust_remote_code,
    )

    tick = time.time()
    quantizers = sequential(
        model,
        dataloader,
        DEV,
        nsamples,
        bits,
        groupsize,
        percdamp=percdamp,
        act_order=act_order,
        hooks=hooks,
        sym=sym,
    )
    print(time.time() - tick)

    pack(model, quantizers, bits, groupsize)
    from safetensors.torch import save_file
    from huggingface_hub import split_torch_state_dict_into_shards

    state_dict = model.state_dict()
    state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()}

    max_shard_size = "10GB"
    state_dict_split = split_torch_state_dict_into_shards(
        state_dict,
        filename_pattern="model.safetensors",
        max_shard_size=max_shard_size,
    )
    index = None
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
    shards = state_dict_split.filename_to_tensors
    os.makedirs(output_dir, exist_ok=True)
    for shard_file, shard in shards.items():
        save_file(
            shard,
            os.path.join(output_dir, shard_file),
            metadata={
                "format": "pt",
                "quantized": "gptq",
                "origin": "text-generation-inference",
            },
        )
    if index is None:
        path_to_weights = os.path.join(output_dir, "model.safetensors")
        logger.info(f"Model weights saved in {path_to_weights}")
    else:
        save_index_file = "model.safetensors.index.json"
        save_index_file = os.path.join(output_dir, save_index_file)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)
        logger.info(
            f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be "
            f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the "
            f"index located at {save_index_file}."
        )
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    config.quantization_config = {
        "bits": bits,
        "group_size": groupsize,
        "damp_percent": percdamp,
        "desc_act": act_order,
        "static_groups": False,
        "sym": sym,
        "quant_method": "gptq",
    }
    config.save_pretrained(output_dir)
    logger.info("Saved config")
    logger.info("Saving tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, trust_remote_code=trust_remote_code
    )
    tokenizer.save_pretrained(output_dir)
    logger.info("Saved tokenizer")

    if upload_to_model_id:
        api = HfApi()

        api.upload_folder(
            folder_path=output_dir, repo_id=upload_to_model_id, repo_type="model"
        )


================================================
FILE: server/text_generation_server/layers/gptq/triton.py
================================================
import math
import numpy as np
import torch
import torch.nn as nn
from torch.cuda.amp import custom_fwd

import triton
import triton.language as tl
from . import custom_autotune


# code based https://github.com/fpgaminer/GPTQ-triton
@custom_autotune.autotune(
    configs=[
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 256,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 128,
                "BLOCK_SIZE_N": 128,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 128,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 128,
                "BLOCK_SIZE_N": 32,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 64,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 128,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=2,
            num_warps=8,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 64,
                "BLOCK_SIZE_K": 64,
                "GROUP_SIZE_M": 8,
            },
            num_stages=3,
            num_warps=8,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 32,
                "BLOCK_SIZE_N": 32,
                "BLOCK_SIZE_K": 128,
                "GROUP_SIZE_M": 8,
            },
            num_stages=2,
            num_warps=4,
        ),
    ],
    key=["M", "N", "K"],
    nearest_power_of_two=True,
    prune_configs_by={
        "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
        "perf_model": None,
        "top_k": None,
    },
)
@triton.jit
def matmul_248_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    scales_ptr,
    zeros_ptr,
    g_ptr,
    M,
    N,
    K,
    bits,
    maxq,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_scales,
    stride_zeros,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """
    Compute the matrix multiplication C = A x B.
    A is of shape (M, K) float16
    B is of shape (K//8, N) int32
    C is of shape (M, N) float16
    scales is of shape (G, N) float16
    zeros is of shape (G, N) float16
    g_ptr is of shape (K) int32
    """
    infearure_per_bits = 32 // bits

    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + (
        offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
    )  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
    a_mask = offs_am[:, None] < M
    # b_ptrs is set up such that it repeats elements along the K axis 8 times
    b_ptrs = b_ptr + (
        (offs_k[:, None] // infearure_per_bits) * stride_bk
        + offs_bn[None, :] * stride_bn
    )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
    g_ptrs = g_ptr + offs_k
    # shifter is used to extract the N bits of each element in the 32-bit word from B
    scales_ptrs = scales_ptr + offs_bn[None, :]
    zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)

    shifter = (offs_k % infearure_per_bits) * bits
    zeros_shifter = (offs_bn % infearure_per_bits) * bits
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    for k in range(0, num_pid_k):
        g_idx = tl.load(g_ptrs)

        # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
        scales = tl.load(
            scales_ptrs + g_idx[:, None] * stride_scales
        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
        zeros = tl.load(
            zeros_ptrs + g_idx[:, None] * stride_zeros
        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)

        zeros = (zeros >> zeros_shifter[None, :]) & maxq
        zeros = (zeros + 1) & maxq  # eventually avoid overflow

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated

        # Now we need to unpack b (which is N-bit values) into 32-bit values
        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
        b = (b - zeros) * scales  # Scale and shift

        accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K
        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
        g_ptrs += BLOCK_SIZE_K

    c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)


def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
    with (
        torch.xpu.device(input.device)
        if torch.xpu.is_available()
        else torch.cuda.device(input.device)
    ):
        output = torch.empty(
            (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
        )

        def grid(META):
            return (
                triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])
                * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]),
            )

        matmul_248_kernel[grid](
            input,
            qweight,
            output,
            scales,
            qzeros,
            g_idx,
            input.shape[0],
            qweight.shape[1],
            input.shape[1],
            bits,
            maxq,
            input.stride(0),
            input.stride(1),
            qweight.stride(0),
            qweight.stride(1),
            output.stride(0),
            output.stride(1),
            scales.stride(0),
            qzeros.stride(0),
        )
        return output


class QuantLinearFunction(torch.autograd.Function):
    @staticmethod
    @custom_fwd(cast_inputs=torch.float16)
    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
        output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
        return output


class QuantLinear(nn.Module):
    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
        super().__init__()
        self.register_buffer("qweight", qweight)
        self.register_buffer("qzeros", qzeros)
        self.register_buffer("scales", scales)
        self.register_buffer("g_idx", g_idx)
        if bias is not None:
            self.register_buffer("bias", bias)
        else:
            self.bias = None
        if bits not in [2, 4, 8]:
            raise NotImplementedError("Only 2,4,8 bits are supported.")
        self.bits = bits
        self.maxq = 2**self.bits - 1
        self.groupsize = groupsize

        self.outfeatures = qweight.shape[1]
        self.infeatures = qweight.shape[0] * 32 // bits

    @classmethod
    def new(cls, bits, groupsize, infeatures, outfeatures, bias):
        if bits not in [2, 4, 8]:
            raise NotImplementedError("Only 2,4,8 bits are supported.")

        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
        qzeros = torch.zeros(
            (math.ceil(infeatures / groupsize), outfeatures // 32 * bits),
            dtype=torch.int32,
        )
        scales = torch.zeros(
            (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
        )
        g_idx = torch.tensor(
            [i // groupsize for i in range(infeatures)], dtype=torch.int32
        )
        if bias:
            bias = torch.zeros((outfeatures), dtype=torch.float16)
        else:
            bias = None
        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)

    def pack(self, linear, scales, zeros, g_idx=None):
        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx

        scales = scales.t().contiguous()
        zeros = zeros.t().contiguous()
        scale_zeros = zeros * scales
        self.scales = scales.clone().half()
        if linear.bias is not None:
            self.bias = linear.bias.clone().half()

        intweight = []
        for idx in range(self.infeatures):
            intweight.append(
                torch.round(
                    (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
                    / self.scales[self.g_idx[idx]]
                ).to(torch.int)[:, None]
            )
        intweight = torch.cat(intweight, dim=1)
        intweight = intweight.t().contiguous()
        intweight = intweight.numpy().astype(np.uint32)
        qweight = np.zeros(
            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
        )
        i = 0
        row = 0
        while row < qweight.shape[0]:
            if self.bits in [2, 4, 8]:
                for j in range(i, i + (32 // self.bits)):
                    qweight[row] |= intweight[j] << (self.bits * (j - i))
                i += 32 // self.bits
                row += 1
            else:
                raise NotImplementedError("Only 2,4,8 bits are supported.")

        qweight = qweight.astype(np.int32)
        self.qweight = torch.from_numpy(qweight)

        zeros -= 1
        zeros = zeros.numpy().astype(np.uint32)
        qzeros = np.zeros(
            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
        )
        i = 0
        col = 0
        while col < qzeros.shape[1]:
            if self.bits in [2, 4, 8]:
                for j in range(i, i + (32 // self.bits)):
                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
                i += 32 // self.bits
                col += 1
            else:
                raise NotImplementedError("Only 2,4,8 bits are supported.")

        qzeros = qzeros.astype(np.int32)
        self.qzeros = torch.from_numpy(qzeros)

    def forward(self, x):
        out_shape = x.shape[:-1] + (self.outfeatures,)
        out = QuantLinearFunction.apply(
            x.reshape(-1, x.shape[-1]),
            self.qweight,
            self.scales,
            self.qzeros,
            self.g_idx,
            self.bits,
            self.maxq,
        )
        out = out + self.bias if self.bias is not None else out
        return out.reshape(out_shape)


================================================
FILE: server/text_generation_server/layers/gptq/utils.py
================================================
import torch


# copied from https://github.com/openppl-public/ppq/blob/master/ppq/quantization/measure/norm.py
def torch_snr_error(
    y_pred: torch.Tensor, y_real: torch.Tensor, reduction: str = "mean"
) -> torch.Tensor:
    """
    Compute SNR between y_pred(tensor) and y_real(tensor)

    SNR can be calcualted as following equation:

        SNR(pred, real) = (pred - real) ^ 2 / (real) ^ 2

    if x and y are matrixs, SNR error over matrix should be the mean value of SNR error over all elements.

        SNR(pred, real) = mean((pred - real) ^ 2 / (real) ^ 2)

    Args:
        y_pred (torch.Tensor): _description_
        y_real (torch.Tensor): _description_
        reduction (str, optional): _description_. Defaults to 'mean'.

    Raises:
        ValueError: _description_
        ValueError: _description_

    Returns:
        torch.Tensor: _description_
    """
    if y_pred.shape != y_real.shape:
        raise ValueError(
            f"Can not compute snr loss for tensors with different shape. "
            f"({y_pred.shape} and {y_real.shape})"
        )
    reduction = str(reduction).lower()

    if y_pred.ndim == 1:
        y_pred = y_pred.unsqueeze(0)
        y_real = y_real.unsqueeze(0)

    y_pred = y_pred.flatten(start_dim=1)
    y_real = y_real.flatten(start_dim=1)

    noise_power = torch.pow(y_pred - y_real, 2).sum(dim=-1)
    signal_power = torch.pow(y_real, 2).sum(dim=-1)
    snr = (noise_power) / (signal_power + 1e-7)

    if reduction == "mean":
        return torch.mean(snr)
    elif reduction == "sum":
        return torch.sum(snr)
    elif reduction == "none":
        return snr
    else:
        raise ValueError("Unsupported reduction method.")


================================================
FILE: server/text_generation_server/layers/layernorm.py
================================================
import torch
from torch import nn
from accelerate import init_empty_weights
from text_generation_server.utils.import_utils import (
    SYSTEM,
)


# Monkey patching
@classmethod
def load_layer_norm(cls, prefix, weights, eps):
    weight = weights.get_tensor(f"{prefix}.weight")
    bias = weights.get_tensor(f"{prefix}.bias")
    with init_empty_weights():
        ln = cls(weight.shape, eps=eps)

    ln.weight = torch.nn.Parameter(weight)
    ln.bias = torch.nn.Parameter(bias)
    return ln


@classmethod
def load_layer_norm_no_bias(cls, prefix, weights, eps):
    weight = weights.get_tensor(f"{prefix}.weight")
    with init_empty_weights():
        ln = cls(weight.shape, eps=eps)

    ln.weight = torch.nn.Parameter(weight)
    ln.bias = None
    return ln


torch.nn.LayerNorm.load = load_layer_norm
torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias

if SYSTEM == "cuda":
    import dropout_layer_norm

    class FastLayerNorm(nn.LayerNorm):
        def forward(self, hidden_states, residual=None):
            if hidden_states.shape[-1] > 8192:
                if residual is not None:
                    hidden_states += residual
                residual = hidden_states

                return super(FastLayerNorm, self).forward(hidden_states), residual
            else:
                (
                    normed_hidden_states,
                    residual,
                    *rest,
                ) = dropout_layer_norm.dropout_add_ln_fwd(
                    hidden_states,
                    residual,
                    self.weight,
                    self.bias,
                    None,
                    None,
                    None,
                    None,
                    0.0,
                    self.eps,
                    1.0,
                    0,
                    None,
                    False,
                    False,
                )
                if residual is None:
                    residual = hidden_states

                return normed_hidden_states, residual

elif SYSTEM == "rocm":
    import vllm._custom_ops as ops

    class FastLayerNorm(nn.LayerNorm):
        def forward(self, hidden_states, residual=None):
            if residual is not None:
                hidden_states += residual
            residual = hidden_states

            return super().forward(hidden_states), residual

elif SYSTEM == "ipex":
    import intel_extension_for_pytorch as ipex

    class FastLayerNorm(nn.LayerNorm):
        def forward(self, hidden_states, residual=None):
            out = ipex.llm.functional.add_layer_norm(
                residual,
                hidden_states,
                self.weight,
                self.bias,
                self.eps,
                residual is not None,
            )
            return out, residual if residual is not None else hidden_states


class FastRMSNorm(nn.Module):
    def __init__(self, weight: torch.Tensor, eps: float):
        super().__init__()

        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps

    @classmethod
    def load(cls, prefix, weights, eps=1e-6):
        weight = weights.get_tensor(f"{prefix}.weight")
        return cls(weight, eps)

    def forward(self, hidden_states, residual=None):
        if SYSTEM == "ipex":
            out = ipex.llm.functional.add_rms_norm(
                residual,
                hidden_states,
                self.weight,
                None,
                self.variance_epsilon,
                residual is not None,
            )
            return out, residual if residual is not None else hidden_states
        elif SYSTEM == "rocm":
            # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
            if residual is not None:
                ops.fused_add_rms_norm(
                    hidden_states,
                    residual,
                    self.weight.data,
                    self.variance_epsilon,
                )
                return hidden_states, residual

            residual = hidden_states

            out = torch.empty_like(hidden_states)
            ops.rms_norm(
                out,
                hidden_states,
                self.weight.data,
                self.variance_epsilon,
            )
            return out, residual
        elif hidden_states.shape[-1] > 8192:
            if residual is not None:
                hidden_states += residual
            residual = hidden_states

            hidden_states = hidden_states.to(torch.float32)
            variance = hidden_states.pow(2).mean(-1, keepdim=True)
            hidden_states = hidden_states * torch.rsqrt(
                variance + self.variance_epsilon
            )

            # convert into half-precision if necessary
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                hidden_states = hidden_states.to(self.weight.dtype)

            return self.weight * hidden_states, residual
        elif SYSTEM == "cuda":
            # faster post attention rms norm
            (
                normed_hidden_states,
                res,
                *rest,
            ) = dropout_layer_norm.dropout_add_ln_fwd(
                hidden_states,
                residual,
                self.weight,
                None,
                None,
                None,
                None,
                None,
                0.0,
                self.variance_epsilon,
                1.0,
                0,
                None,
                False,
                True,  # Activate RMSNorm
            )
            if res is None:
                res = hidden_states

            return normed_hidden_states, res
        else:
            raise ValueError(
                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
            )


================================================
FILE: server/text_generation_server/layers/linear.py
================================================
import torch
from text_generation_server.utils.import_utils import SYSTEM
from torch.nn import functional as F
import os

if SYSTEM == "rocm":
    ROCM_USE_SKINNY_GEMM = os.getenv("ROCM_USE_SKINNY_GEMM", "True").lower() in (
        "true",
        "1",
    )

    if ROCM_USE_SKINNY_GEMM:
        try:
            import vllm._custom_ops as ops
        except Exception as e:
            raise ImportError(
                f"Could not load `vllm._custom_ops` for ROCm skinny gemm. Full error: {e}"
            )


class FastLinear(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(weight, requires_grad=False)
        if bias is not None:
            self.bias = torch.nn.Parameter(bias, requires_grad=False)
        else:
            self.bias = None

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.linear(input, self.weight, self.bias)


class FastLinearROCm(torch.nn.Module):
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(weight)
        if bias is not None:
            self.bias = torch.nn.Parameter(bias)
        else:
            self.bias = None

        self.cu_count = torch.cuda.get_device_properties(
            device="cuda"
        ).multi_processor_count
        self.use_skinny_gemm = (
            ROCM_USE_SKINNY_GEMM
            and "gfx1" not in torch.cuda.get_device_properties("cuda").gcnArchName
        )

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)

    def forward(self, inp: torch.Tensor) -> torch.Tensor:
        weight = self.weight
        bias = self.bias

        if (
            self.use_skinny_gemm
            and inp.dtype == torch.float16
            and inp.shape[-1] % 8 == 0
        ):
            batched = False
            inp_shape = inp.shape

            if inp.dim() == 3:
                inp = inp.view(-1, inp_shape[-1])
                batched = True

            m, n, k = weight.shape[0], inp_shape[0], inp_shape[1]
            if m > 8 and n <= 4:
                out = torch.empty(
                    inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device
                )
                ops.wvSpltK(weight, inp, out, n, self.cu_count)
            elif m % 4 == 0 and n == 1 and k <= 8192:
                out = torch.empty(
                    inp_shape[0], weight.shape[0], dtype=inp.dtype, device=weight.device
                )
                ops.LLMM1(weight, inp, out, 4)
            else:
                out = F.linear(inp, weight)

            if batched:
                out.view(*inp_shape[:-1], out.shape[-1])

            if bias is not None:
                out = out + bias
            return out
        return F.linear(inp, self.weight, self.bias)


def get_linear(weight, bias):
    # Weights that are loaded through methods that are not
    # quantization-aware are still bare tensors. We may want
    # to change this in the future.
    if isinstance(weight, torch.Tensor):
        if SYSTEM == "rocm":
            return FastLinearROCm(weight, bias)
        else:
            return FastLinear(weight, bias)

    return weight.get_linear(bias)


================================================
FILE: server/text_generation_server/layers/lora.py
================================================
from typing import TYPE_CHECKING, Optional, List

import torch
import torch.distributed
from torch import nn
from torch.distributed import ProcessGroup
from text_generation_server.utils.import_utils import SYSTEM

from text_generation_server.utils.kernels import load_kernel

if SYSTEM == "cuda":
    punica_sgmv = load_kernel(
        module="punica_sgmv", repo_id="kernels-community/punica-sgmv"
    )
else:
    punica_sgmv = None

if SYSTEM == "ipex":
    try:
        from intel_extension_for_pytorch.llm.functional import (
            bgmv_expand,
            bgmv_shrink,
            sgmv_expand,
            sgmv_shrink,
        )
    except ImportError:
        pass


if TYPE_CHECKING:
    from text_generation_server.adapters import AdapterBatchData
    from text_generation_server.adapters.lora import BatchLoraWeights


class LoraLinear(nn.Module):
    def __init__(
        self, base_layer: nn.Module, layer_id: int, process_group: ProcessGroup
    ):
        super().__init__()
        self.base_layer = base_layer
        self.layer_id = layer_id
        self.process_group = process_group

    def forward_layer_type(
        self,
        result: torch.Tensor,
        input: torch.Tensor,
        adapter_data: "AdapterBatchData",
        layer_type: str,
        start_idx: int,
        end_idx: int,
    ) -> torch.Tensor:
        if adapter_data is None:
            return result
        data: Optional["BatchLoraWeights"] = adapter_data.data.get(layer_type)

        if data is not None and (
            SYSTEM == "ipex"
            or (punica_sgmv is not None and data.can_vectorize(self.process_group))
        ):
            # In tensor-parallel configurations, each GPU processes a specific segment of the output.
            # The 'result' tensor represents the full output, which can vary in size based on
            # the layer type (e.g., attention vs. feed-forward layers). We define the current
            # segment using start_idx and end_idx. If the segment size doesn't match this GPU's
            # slice of 'result', we create a zero tensor of the correct size for LoRA computation.
            # This approach ensures accurate LoRA application across various layer sizes and
            # configurations, adapting to different model architectures and parallelization strategies.
            #
            # Example scenarios where this is necessary:
            # 1. The adapter's size doesn't evenly divide across GPUs.
            # 2. We're processing the last segment which might be smaller.
            # 3. Different projection layers (q, k, v) have different sizes.
            if end_idx - start_idx != result.shape[1]:
                proj = torch.zeros_like(result[:, start_idx:end_idx])
            else:
                proj = result

            for r, rank_segments in data.rank_data.items():
                if SYSTEM == "ipex":
                    lora_a_ptr = rank_segments.lora_a_ptr[
                        :, self.layer_id, :
                    ].contiguous()
                    lora_b_ptr = rank_segments.lora_b_ptr[
                        :, self.layer_id, :
                    ].contiguous()
                else:
                    lora_a_ptr = rank_segments.lora_a_ptr
                    lora_b_ptr = rank_segments.lora_b_ptr

                if lora_a_ptr is None or lora_b_ptr is None:
                    raise ValueError("LoRA data is missing")

                if data.use_sgmv:
                    if SYSTEM == "ipex":
                        # Use SGMV for prefill
                        seq_len_tensor = (
                            rank_segments.segment_ends - rank_segments.segment_starts
                        ).to(torch.int64)
                        b_seq_start_loc = rank_segments.segment_starts.to(torch.int64)
                        total_tokens = seq_len_tensor.sum()
                        v = torch.zeros(
                            (total_tokens, r), dtype=input.dtype, device=input.device
                        )
                        bs = seq_len_tensor.shape[0]
                        sgmv_shrink(
                            input,
                            lora_a_ptr,
                            v,
                            b_seq_start_loc,
                            seq_len_tensor,
                            rank_segments.indices,
                            bs,
                            seq_len_tensor.max().item(),
                            1.0,
                        )
                    else:
                        # Use SGMV for prefill
                        v = punica_sgmv.lora_a_sgmv_cutlass(
                            input,
                            rank_segments.tmp_shrink,
                            lora_a_ptr,
                            rank_segments.segment_starts,
                            rank_segments.segment_ends,
                            self.layer_id,
                            r,
                        )

                    if self.process_group.size() > 1:
                        v = self.collect_lora_a(v)
                    if SYSTEM == "ipex":
                        sgmv_expand(
                            v,
                            lora_b_ptr,
                            proj,
                            b_seq_start_loc,
                            seq_len_tensor,
                            rank_segments.indices,
                            bs,
                            seq_len_tensor.max().item(),
                            add_inputs=True,
                        )
                    else:
                        punica_sgmv.lora_b_sgmv_cutlass(
                            proj,
                            v,
                            rank_segments.tmp_expand,
                            lora_b_ptr,
                            rank_segments.segment_starts,
                            rank_segments.segment_ends,
                            self.layer_id,
                        )
                else:
                    # Use BGMV for decode
                    v = torch.zeros(
                        (input.size(0), r), dtype=input.dtype, device=input.device
                    )
                    if SYSTEM == "ipex":
                        bgmv_shrink(
                            input,
                            lora_a_ptr,
                            v,
                            rank_segments.indices,
                            1.0,
                        )
                    else:
                        # TODO: error with [-1, 0], but not [0, -1]
                        punica_sgmv.add_lora_a_bgmv(
                            v,
                            input,
                            lora_a_ptr,
                            rank_segments.indices,
                            self.layer_id,
                        )

                    if self.process_group.size() > 1:
                        v = self.collect_lora_a(v)

                    if SYSTEM == "ipex":
                        bgmv_expand(
                            v,
                            lora_b_ptr,
                            proj,
                            rank_segments.indices,
                            add_inputs=True,
                        )
                    else:
                        punica_sgmv.add_lora_b_bgmv(
                            proj,
                            v,
                            lora_b_ptr,
                            rank_segments.indices,
                            self.layer_id,
                        )

            if end_idx - start_idx != result.shape[1]:
                result[:, start_idx:end_idx] += proj
        else:
            for adapter_index in adapter_data.meta.adapter_set:
                if data is not None and data.has_adapter(adapter_index):
                    adapter_mask = (
                        (adapter_data.meta.adapter_indices == adapter_index)
                        .to(input.dtype)
                        .view(-1, 1)
                    )
                    layer_result = self.forward_lora(
                        input, data, adapter_index, adapter_mask
                    )
                    result[:, start_idx:end_idx] += layer_result

        return result

    def forward_lora(
        self,
        input: torch.Tensor,
        data: "BatchLoraWeights",
        adapter_index: int,
        adapter_mask: torch.Tensor,
    ) -> torch.Tensor:
        lora_a = data.lora_a[adapter_index][self.layer_id, :, :]
        lora_b = data.lora_b[adapter_index][self.layer_id, :, :]

        lora_a = punica_sgmv.orient_for_rank(lora_a, lora_b.size(0))

        a_out = input @ lora_a
        if self.process_group.size() > 1:
            a_out = self.collect_lora_a(a_out)

        result = (a_out @ lora_b) * adapter_mask
        return result

    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError("Implemented in subclasses")


class TensorParallelMultiAdapterLinear(LoraLinear):
    def __init__(
        self,
        base_layer: nn.Module,
        layer_id: int,
        layer_names: List[str],
        sizes: List[int],
        process_group: ProcessGroup,
    ):
        super().__init__(base_layer, layer_id, process_group)
        self.layer_names = layer_names
        self.sizes = sizes

    @classmethod
    def load(
        cls,
        base_layer: nn.Module,
        layer_id: int,
        layer_names: List[str],
        sizes: List[int],
        process_group: ProcessGroup,
    ):
        return TensorParallelMultiAdapterLinear(
            base_layer, layer_id, layer_names, sizes, process_group
        )

    def forward(
        self, input: torch.Tensor, adapter_data: "AdapterBatchData"
    ) -> torch.Tensor:
        result = self.base_layer(input)

        # noop if no layer names are provided (e.g. for models without adapters)
        if self.layer_names is None:
            return result

        # handle models like Bloom that have inputs of shape
        # (batch_size, sequence_length, hidden_size)
        # we need to reshape them to (batch_size * sequence_length, hidden_size)
        # for the LoRA computation, then reshape back
        prev_shape = result.shape
        is_3d = len(input.shape) >= 3
        if is_3d:
            input = input.reshape(-1, input.shape[-1])
            result = result.reshape(-1, result.shape[-1])

        offset = 0
        for i, layer_name in enumerate(self.layer_names):
            start_idx = offset // self.process_group.size()
            # The 'sizes' parameter is essential in tensor-parallel setups for handling multiple
            # projection layers (q_proj, k_proj, v_proj) by defining their output dimensions. It
            # ensures correct slicing of the result tensor, accommodating variations like grouped-query
            # attention where k_proj and v_proj differ from q_proj. This allows precise application of
            # LoRA adapters to each sub-component of the multi-head attention mechanism, managing the
            # different projection sizes across layers and model architectures.
            if self.sizes is not None:
                offset += self.sizes[i]
                end_idx = offset // self.process_group.size()
            else:
                end_idx = result.shape[1]

            result = self.forward_layer_type(
                result, input, adapter_data, layer_name, start_idx, end_idx
            )

        if is_3d:
            result = result.reshape(prev_shape)

        return result

    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
        # Tensor parallel implementation of X @ A@B, where A and B are sharded column-wise.
        # We use an all-gather between X@A and (X@A)@B to ensure alignment across ranks.
        #
        # TODO(travis): this is not very efficient as we do an all-gather for every adapter,
        #   instead we could pre-allocate a (B, a, r) tensor for all adapters with the same
        #   rank, compute `a_out` on each, and then slice them into the buffer as shown here:
        #   https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609
        gathered_tensors = [
            torch.empty_like(a_out) for _ in range(self.process_group.size())
        ]
        torch.distributed.all_gather(gathered_tensors, a_out)
        return torch.cat(gathered_tensors, dim=1)


class TensorParallelAdapterRowLinear(LoraLinear):
    def __init__(self, base_layer, layer_id, layer_name, process_group):
        super().__init__(base_layer, layer_id, process_group)
        self.layer_name = layer_name

    @classmethod
    def load(cls, base_layer, layer_id, layer_name, process_group):
        return cls(base_layer, layer_id, layer_name, process_group)

    def forward(
        self, input: torch.Tensor, adapter_data: "AdapterBatchData"
    ) -> torch.Tensor:
        result = self.base_layer(input)

        if self.layer_name is None:
            return result

        # Fused all-gather + all-reduce from S-LoRA paper: https://arxiv.org/abs/2311.03285
        stride = result.shape[-1] // self.process_group.size()
        start_idx = self.process_group.rank() * stride
        end_idx = (self.process_group.rank() + 1) * stride

        self.forward_layer_type(
            result, input, adapter_data, self.layer_name, start_idx, end_idx
        )

        return result

    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
        # Tensor parallel implementation of X @ A@B, where A and B are sharded row-wise.
        # We use an all-reduce between X@A and (X@A)@B to ensure alignment across ranks.
        #
        # TODO(travis): this is not very efficient as we do an all-reduce for every adapter,
        #   instead we could pre-allocate a (B, a, r) tensor for all adapters with the same
        #   rank, compute `a_out` on each, and then slice them into the buffer as shown here:
        #   https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609
        torch.distributed.all_reduce(a_out, group=self.process_group)
        return a_out


================================================
FILE: server/text_generation_server/layers/marlin/__init__.py
================================================
from text_generation_server.layers.marlin.fp8 import GPTQMarlinFP8Linear
from text_generation_server.layers.marlin.gptq import (
    GPTQMarlinWeightsLoader,
    can_use_gptq_marlin,
    repack_gptq_for_marlin,
)
from text_generation_server.layers.marlin.marlin import MarlinWeightsLoader

__all__ = [
    "GPTQMarlinFP8Linear",
    "GPTQMarlinWeightsLoader",
    "MarlinWeightsLoader",
    "can_use_gptq_marlin",
    "repack_gptq_for_marlin",
]


================================================
FILE: server/text_generation_server/layers/marlin/fp8.py
================================================
from typing import Optional

import torch
import torch.nn as nn
from text_generation_server.layers.fp8 import fp8_quantize
from text_generation_server.layers.marlin.gptq import _check_valid_shape
from text_generation_server.layers.marlin.util import (
    _check_marlin_kernels,
    permute_scales,
)
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel

if SYSTEM == "cuda":
    quantization = load_kernel(
        module="quantization", repo_id="kernels-community/quantization"
    )
else:
    quantization = None


MARLIN_TILE_SIZE = 16


class GPTQMarlinFP8Linear(nn.Module):
    """
    FP8 GPTQ-Marlin linear layer.
    """

    def __init__(
        self,
        qweight: torch.Tensor,
        scales: torch.Tensor,
        bias: Optional[torch.Tensor],
    ) -> None:
        super().__init__()

        _check_marlin_kernels()
        assert quantization is not None

        scales = scales.unsqueeze(0)
        if scales.shape[1] == 1:
            out_features, in_features = qweight.shape
            scales = scales.repeat(1, out_features)
        qweight, scales = repack_fp8_for_marlin(qweight, scales)

        in_features = qweight.shape[0] * MARLIN_TILE_SIZE
        out_features = scales.shape[1]
        _check_valid_shape(in_features=in_features, out_features=out_features)

        self.qweight = qweight
        self.scales = scales
        self.bias = bias if bias is not None else None

        self.workspace = torch.zeros(
            out_features // 64 * 16, dtype=torch.int, device=qweight.device
        )

    @classmethod
    def from_unquant(cls, weight, bias, dtype):
        qweight, scales = fp8_quantize(weight)
        return cls(qweight=qweight, scales=scales.to(dtype), bias=bias)

    @classmethod
    def from_fp8(
        cls,
        weight: torch.Tensor,
        scale: torch.Tensor,
        bias: torch.Tensor,
        dtype: torch.dtype,
        **kwargs,
    ):
        return cls(qweight=weight, scales=scale.to(dtype), bias=bias)

    def forward(self, A: torch.Tensor) -> torch.Tensor:
        assert quantization is not None

        A_flat = A.view(-1, A.shape[-1])
        C = quantization.fp8_marlin_gemm(
            A_flat,
            self.qweight,
            self.scales,
            self.workspace,
            8,
            A_flat.shape[0],
            self.scales.shape[1],
            A_flat.shape[1],
        )
        C = C.reshape(A.shape[:-1] + (self.scales.shape[1],))

        if self.bias is not None:
            C += self.bias

        return C


def pack_fp8_as_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
    """
    Repack FP8 weights to gptq format (packed int32 elements).
    """
    assert fp8_tensor.dtype == torch.float8_e4m3fn

    if fp8_tensor.shape[0] % 4 != 0:
        raise ValueError(
            f"Leading tensor dimension is not divisable by 4: {fp8_tensor.shape[0]}"
        )

    # Reshape to prepare for packing
    reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:])

    # Convert fp8 to uint8 (byte) representation
    byte_tensor = reshaped.view(torch.uint8)

    # Pack 4 uint8 values into one int32
    packed = torch.zeros(
        fp8_tensor.shape[0] // 4,
        fp8_tensor.shape[1],
        dtype=torch.int32,
        device=fp8_tensor.device,
    )

    for i in range(4):
        packed.bitwise_or_(byte_tensor[:, i].to(torch.int32) << i * 8)

    return packed


def repack_fp8_for_marlin(weight: torch.Tensor, scales: torch.Tensor):
    """
    Repack FP8 tensor for GPTQ-Marlin.
    """

    out_features, in_features = weight.shape

    # Torch linear layers weights with shape [out_features, in_features],
    # GPTQ-quantized weights use [in_feateres/pack_factor, in_features],
    # so transpose before packing.
    qweight = pack_fp8_as_int32(weight.t())

    perm = torch.empty(0, dtype=torch.int, device=qweight.device)
    repacked = quantization.gptq_marlin_repack(
        qweight, perm, in_features, out_features, 8
    )

    scales = permute_scales(scales)

    return repacked, scales


================================================
FILE: server/text_generation_server/layers/marlin/gptq.py
================================================
from dataclasses import dataclass
from typing import List, Optional, Union

import numpy
import torch
import torch.nn as nn
from loguru import logger
from text_generation_server.layers.marlin.util import (
    _check_marlin_kernels,
    marlin_zero_points,
    permute_scales,
    unpack_cols,
)
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader

if SYSTEM == "cuda":
    quantization = load_kernel(
        module="quantization", repo_id="kernels-community/quantization"
    )
else:
    quantization = None


try:
    major, _minor = torch.cuda.get_device_capability()
    has_sm_8_0 = major >= 8
except Exception:
    has_sm_8_0 = False


GPTQ_MARLIN_BITS = [4, 8]
GPTQ_MARLIN_GROUP_SIZES = [-1, 32, 64, 128]
MARLIN_TILE_SIZE = 16


def can_use_gptq_marlin(
    *, bits: int, groupsize: int, quant_method: str, quantize: str, sym: bool
) -> bool:
    return (
        SYSTEM == "cuda"
        and quantization is not None
        and has_sm_8_0
        and quantize in {"awq", "gptq"}
        and quant_method in {"awq", "gptq"}
        and bits in GPTQ_MARLIN_BITS
        and groupsize in GPTQ_MARLIN_GROUP_SIZES
        # We only support asymmetric quantization for AWQ.
        and (sym or quant_method == "awq")
    )


class GPTQMarlinWeightsLoader(WeightsLoader):
    """
    Loader for using GPTQ- and AWQ-quantized weights with Marlin kernels.
    """

    def __init__(
        self,
        *,
        bits: int,
        desc_act: bool,
        groupsize: int,
        quant_method: str,
        quantize: str,
        sym: bool,
    ):
        self.bits = bits
        self.desc_act = desc_act
        self.groupsize = groupsize
        self.quant_method = quant_method
        self.quantize = quantize
        self.sym = sym

    def get_weights(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            qweight = weights.get_tensor(f"{prefix}.qweight")
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
            )

        if not self.sym:
            qzeros = weights.get_tensor(f"{prefix}.qzeros")
        else:
            qzeros = None

        if self.quant_method == "awq":
            g_idx = None
        else:
            g_idx = weights.get_tensor(f"{prefix}.g_idx")
        scales = weights.get_tensor(f"{prefix}.scales")

        return repack_gptq_for_marlin(
            qweight=qweight,
            scales=scales,
            qzeros=qzeros,
            g_idx=g_idx,
            bits=self.bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method=self.quant_method,
            sym=self.sym,
            sharded_infeatures=False,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        try:
            qweight = weights.get_packed_sharded(
                f"{prefix}.qweight", dim=1, block_sizes=block_sizes
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
            )
        scales = weights.get_packed_sharded(
            f"{prefix}.scales", dim=1, block_sizes=block_sizes
        )
        scales = scales.to(dtype=weights.dtype)

        if not self.sym:
            qzeros = weights.get_packed_sharded(
                f"{prefix}.qzeros", dim=1, block_sizes=block_sizes
            )
        else:
            qzeros = None

        if self.quant_method == "awq":
            g_idx = None
        else:
            g_idx = weights.get_tensor(f"{prefix}.g_idx")
        return repack_gptq_for_marlin(
            qweight=qweight,
            scales=scales,
            qzeros=qzeros,
            g_idx=g_idx,
            bits=self.bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method=self.quant_method,
            sym=self.sym,
            sharded_infeatures=False,
        )

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        try:
            qweight = torch.cat(
                [weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
            )

        scales = torch.cat(
            [weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
        )

        if not self.sym:
            qzeros = torch.cat(
                [weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
            )
        else:
            qzeros = None

        if self.quant_method == "awq":
            g_idx = None
        else:
            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]

        return repack_gptq_for_marlin(
            qweight=qweight,
            scales=scales,
            qzeros=qzeros,
            g_idx=g_idx,
            bits=self.bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method=self.quant_method,
            sym=self.sym,
            sharded_infeatures=False,
        )

    def get_weights_row(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            qweight = weights.get_sharded(f"{prefix}.qweight", dim=0)
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load `{self.quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
            )

        if not self.sym:
            if self.desc_act or self.groupsize == -1:
                qzeros = weights.get_tensor(f"{prefix}.qzeros")
            else:
                qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0)
        else:
            qzeros = None

        if self.quant_method == "awq":
            g_idx = None
        else:
            g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)

        if self.desc_act or self.groupsize == -1:
            scales = weights.get_tensor(f"{prefix}.scales")
        else:
            scales = weights.get_sharded(f"{prefix}.scales", dim=0)

        sharded_in_features = weights.process_group.size() > 1

        return repack_gptq_for_marlin(
            qweight=qweight,
            scales=scales,
            qzeros=qzeros,
            g_idx=g_idx,
            bits=self.bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method=self.quant_method,
            sym=self.sym,
            sharded_infeatures=sharded_in_features,
        )

    def _get_gptq_params(self, weights: Weights):
        if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"):
            self.bits = weights.get_tensor("gptq_bits").item()
            self.groupsize = weights.get_tensor("gptq_groupsize").item()
            self.desc_act = False
            # `server quantize` used asymmetric quantization unconditionally
            # before the `gptq_sym` setting tensor was added.
            self.sym = (
                weights.get_tensor("gptq_sym").item()
                if weights.has_tensor("gptq_sym")
                else False
            )
            self.quant_method = "gptq"


@dataclass
class GPTQMarlinWeight(Weight):
    """
    Repacked GPTQ Marlin weights.
    """

    qweight: torch.Tensor
    qzeros: torch.Tensor
    scales: torch.Tensor
    g_idx: torch.Tensor
    perm: torch.Tensor
    bits: int
    is_full_k: bool

    def __post_init__(self):
        assert self.qweight.dtype == torch.int32
        assert self.scales.dtype in (torch.float16, torch.bfloat16)
        assert self.g_idx.dtype == torch.int32
        assert self.perm.dtype == torch.int32

    def get_linear(self, bias: torch.Tensor):
        return GPTQMarlinLinear(
            weight=self,
            bias=bias,
        )


def repack_gptq_for_marlin(
    *,
    qweight: torch.Tensor,
    qzeros: Optional[torch.Tensor],
    scales: torch.Tensor,
    g_idx: Optional[torch.Tensor],
    bits: int,
    desc_act: bool,
    groupsize: int,
    quant_method: str,
    sym: bool,
    sharded_infeatures: bool,
) -> GPTQMarlinWeight:
    """Convert GPTQ weights to a layout that's compatible with GPTQ-Marlin kernels."""
    _check_marlin_kernels()
    assert quantization is not None

    if bits not in GPTQ_MARLIN_BITS:
        supported_bits = ", ".join(str(b) for b in GPTQ_MARLIN_BITS)
        raise RuntimeError(
            f"Repacking {bits}-bit GPTQ weights as Marlin is not supported, must be one of: {supported_bits}"
        )

    if groupsize not in GPTQ_MARLIN_GROUP_SIZES:
        supported_sizes = ", ".join(str(b) for b in GPTQ_MARLIN_GROUP_SIZES)
        raise RuntimeError(
            f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
        )
    if not (sym or quant_method == "awq" or quant_method == "compressed-tensors"):
        raise RuntimeError(
            "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
        )

    log_once(logger.info, f"Converting {quant_method} model to Marlin packing format.")

    weights_per_int = 32 // bits
    in_features = qweight.shape[0]
    out_features = qweight.shape[1]

    # AWQ uses column packing, GPTQ uses row packing
    if quant_method == "awq":
        out_features *= weights_per_int
    else:
        in_features *= weights_per_int

    if in_features % groupsize != 0:
        raise ValueError(
            f"Number of input features ({in_features}) not divisible by group size ({groupsize})"
        )

    if g_idx is not None and desc_act and groupsize != -1:
        perm = torch.argsort(g_idx).to(torch.int)
        g_idx = g_idx[perm]
    else:
        perm = torch.empty(0, dtype=torch.int, device=qweight.device)
        g_idx = torch.empty(0, dtype=torch.int, device=qweight.device)

    if quant_method == "awq":
        repacked = quantization.awq_marlin_repack(
            qweight, in_features, out_features, bits
        )
        if qzeros is not None:
            qzeros = awq_to_marlin_zero_points(
                qzeros,
                in_features // groupsize,
                out_features,
                bits,
            )

    else:
        repacked = quantization.gptq_marlin_repack(
            qweight, perm, in_features, out_features, bits
        )

    if qzeros is None:
        qzeros = torch.empty(0, dtype=torch.int, device=qweight.device)

    scales = permute_scales(scales)

    is_full_k = not (desc_act and groupsize != -1 and sharded_infeatures)

    return GPTQMarlinWeight(
        qweight=repacked,
        qzeros=qzeros,
        scales=scales,
        g_idx=g_idx,
        perm=perm,
        bits=bits,
        is_full_k=is_full_k,
    )


class GPTQMarlinLinear(nn.Module):
    """
    Linear layer for GPTQ weights that were converted for the GPTQ-Marlin
    kernels.
    """

    def __init__(
        self,
        *,
        weight: GPTQMarlinWeight,
        bias: Optional[torch.Tensor],
    ):
        super().__init__()

        _check_marlin_kernels()
        assert quantization is not None

        in_features = weight.qweight.shape[0] * MARLIN_TILE_SIZE
        out_features = weight.scales.shape[1]
        _check_valid_shape(in_features=in_features, out_features=out_features)

        if weight.bits not in (4, 8):
            raise ValueError("GPTQMarlinLinear only supports 4 and 8-bit quantization")

        if weight.qzeros.numel() > 0:
            if weight.bits == 4:
                self.quant_type = quantization.scalar_types.uint4
            else:
                self.quant_type = quantization.scalar_types.uint8
        else:
            if weight.bits == 4:
                self.quant_type = quantization.scalar_types.uint4b8
            else:
                self.quant_type = quantization.scalar_types.uint8b128

        self.is_full_k = weight.is_full_k

        self.qweight = weight.qweight
        self.qzeros = weight.qzeros
        self.scales = weight.scales
        self.g_idx = weight.g_idx
        self.perm = weight.perm
        if bias is not None:
            self.bias = bias
        else:
            self.bias = None

        self.workspace = torch.zeros(
            out_features // 64 * 16, dtype=torch.int, device=weight.qweight.device
        )

    def forward(self, A: torch.Tensor) -> torch.Tensor:
        assert quantization is not None

        A_flat = A.view(-1, A.shape[-1])
        C = quantization.gptq_marlin_gemm(
            A_flat,
            self.qweight,
            self.scales,
            self.qzeros,
            self.g_idx,
            self.perm,
            self.workspace,
            self.quant_type,
            A_flat.shape[0],
            self.scales.shape[1],
            A_flat.shape[1],
            self.is_full_k,
            self.qzeros.numel() > 0,
            True,
        )
        C = C.reshape(A.shape[:-1] + (self.scales.shape[1],))

        if self.bias is not None:
            C += self.bias

        return C


def awq_to_marlin_zero_points(
    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
) -> torch.Tensor:
    # AWQ zero-points are quantized and packed on the column dim.
    # In addition, the values are permuted based on dequantizer.
    # Here we undo both of these, and then apply marlin permutation
    # and pack it back.
    q_zp = unpack_cols(q_zp_packed, num_bits, size_k, size_n)

    # Undo interleaving (use argsort(..) to get inverse perm)
    if num_bits == 4:
        undo_interleave = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7]))
    elif num_bits == 8:
        undo_interleave = numpy.argsort(numpy.array([0, 2, 1, 3]))
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    q_zp = q_zp.reshape((-1, len(undo_interleave)))[:, undo_interleave].ravel()
    q_zp = q_zp.reshape((-1, size_n)).contiguous()

    marlin_zp = marlin_zero_points(q_zp, size_k, size_n, num_bits)
    return marlin_zp


def _check_valid_shape(in_features: int, out_features: int):
    if (in_features % 128 != 0 or out_features % 64 != 0) and (
        in_features % 64 != 0 or out_features % 128 != 0
    ):
        raise ValueError(
            f"The GPTQ Marlin kernel does not have a valid thread configuration for weight matrix with shape ({out_features}, {in_features})."
            " The shape elements must be divisible by (128, 64) or (64, 128)."
        )


================================================
FILE: server/text_generation_server/layers/marlin/marlin.py
================================================
from dataclasses import dataclass
from typing import List, Optional, Union

import torch
import torch.nn as nn

from text_generation_server.layers.marlin.util import _check_marlin_kernels
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader

if SYSTEM == "cuda":
    quantization = load_kernel(
        module="quantization", repo_id="kernels-community/quantization"
    )
else:
    quantization = None


class MarlinWeightsLoader(WeightsLoader):
    """Loader for Marlin-quantized weights."""

    def __init__(self, *, bits: int, is_marlin_24: bool):
        self.bits = bits
        self.is_marlin_24 = is_marlin_24

    def get_weights(self, weights: "Weights", prefix: str):
        """
        Get weights at the given prefix and apply without tensor paralllism.
        """
        is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24"
        if is_marlin_24:
            try:
                B = weights.get_tensor(f"{prefix}.B_24")
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `marlin` 2:4 sparsity weight, make sure the model is already quantized."
                )

            B_meta = weights.get_tensor(f"{prefix}.B_meta")
            s = weights.get_tensor(f"{prefix}.s")
            weight = GPTQMarlin24Weight(
                weight_packed=B, meta=B_meta, scale_packed=s, bits=self.bits
            )
        else:
            try:
                B = weights.get_tensor(f"{prefix}.B")
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `marlin` weight, make sure the model is already quantized."
                )

            s = weights.get_tensor(f"{prefix}.s")
            weight = MarlinWeight(B=B, s=s)

        return weight

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        if self.is_marlin_24:
            B = weights.get_packed_sharded(
                f"{prefix}.B_24", dim=1, block_sizes=block_sizes
            )
            B_meta = weights.get_packed_sharded(
                f"{prefix}.B_meta", dim=1, block_sizes=block_sizes
            )
            s = weights.get_packed_sharded(
                f"{prefix}.s", dim=1, block_sizes=block_sizes
            )

            weight = GPTQMarlin24Weight(
                weight_packed=B, meta=B_meta, scale_packed=s, bits=self.bits
            )
        else:
            B = weights.get_packed_sharded(
                f"{prefix}.B", dim=1, block_sizes=block_sizes
            )
            s = weights.get_packed_sharded(
                f"{prefix}.s", dim=1, block_sizes=block_sizes
            )
            weight = MarlinWeight(B=B, s=s)

        return weight

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        if self.is_marlin_24:
            try:
                B = torch.cat(
                    [weights.get_sharded(f"{p}.B_24", dim=1) for p in prefixes], dim=1
                )
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `marlin` weight, make sure the model is already quantized"
                )

            B_meta = torch.cat(
                [weights.get_sharded(f"{p}.B_meta", dim=1) for p in prefixes], dim=1
            )

            s = torch.cat(
                [weights.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1
            )

            weight = GPTQMarlin24Weight(
                weight_packed=B, meta=B_meta, scale_packed=s, bits=self.bits
            )
        else:
            try:
                B = torch.cat(
                    [weights.get_sharded(f"{p}.B", dim=1) for p in prefixes], dim=1
                )
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `marlin` weight, make sure the model is already quantized"
                )
            s = torch.cat(
                [weights.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1
            )

            weight = MarlinWeight(B=B, s=s)

        return weight

    def get_weights_row(self, weights: Weights, prefix: str):
        if self.is_marlin_24:
            try:
                B = weights.get_sharded(f"{prefix}.B_24", dim=0)
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `marlin` 2:4 sparsity weight, make sure the model is already quantized."
                )

            B_meta = weights.get_sharded(f"{prefix}.B_meta", dim=0)
            num_groups = weights._get_slice(f"{prefix}.s").get_shape()[0]
            if num_groups == 1:
                # The number of groups is 1 when groupsize == -1. share
                # scales between all shards in this case.
                s = weights.get_tensor(f"{prefix}.s")
            else:
                s = weights.get_sharded(f"{prefix}.s", dim=0)

            weight = GPTQMarlin24Weight(
                weight_packed=B, meta=B_meta, scale_packed=s, bits=self.bits
            )
        else:
            try:
                B = weights.get_sharded(f"{prefix}.B", dim=0)
            except RuntimeError:
                raise RuntimeError(
                    "Cannot load `marlin` weight, make sure the model is already quantized."
                )

            num_groups = weights._get_slice(f"{prefix}.s").get_shape()[0]
            if num_groups == 1:
                # The number of groups is 1 when groupsize == -1. share
                # scales between all shards in this case.
                s = weights.get_tensor(f"{prefix}.s")
            else:
                s = weights.get_sharded(f"{prefix}.s", dim=0)
            weight = MarlinWeight(B=B, s=s)

        return weight


@dataclass
class MarlinWeight(Weight):
    """
    Marlin weights.

    Attributes:
        B (torch.Tensor): int4-quantized weights packed into int32.
        s (torch.Tensor): bfloat16/float16 scales.
    """

    B: torch.Tensor
    s: torch.Tensor

    def __post_init__(self):
        assert self.B.dtype == torch.int32
        assert self.s.dtype in [torch.float16, torch.bfloat16]

    def get_linear(self, bias: torch.Tensor):
        return MarlinLinear(weight=self, bias=bias)


class MarlinLinear(nn.Module):
    def __init__(self, *, weight: MarlinWeight, bias: Optional[torch.Tensor]):
        super().__init__()

        _check_marlin_kernels()
        assert quantization is not None

        in_features = weight.B.shape[0] * MARLIN_TILE_SIZE
        out_features = weight.s.shape[1]
        assert (
            in_features % 128 == 0
        ), f"Number of input features ({in_features}) not divisable by 128"
        assert (
            out_features % 256 == 0
        ), f"Number of output features ({out_features}) not divisable by 256"

        groupsize = -1 if weight.s.shape[0] == 1 else in_features // weight.s.shape[0]
        assert groupsize in {
            -1,
            128,
        }, f"Group size must be -1 or 128, was {groupsize}"

        self.B = weight.B
        self.s = weight.s
        if bias is not None:
            self.bias = bias
        else:
            self.bias = None

        self.workspace = torch.zeros(
            out_features // 64 * 16, dtype=torch.int, device=weight.B.device
        )

    def forward(self, A: torch.Tensor) -> torch.Tensor:
        assert quantization is not None

        C = quantization.marlin_gemm(
            A.view(-1, A.shape[-1]),
            self.B,
            self.s,
            self.workspace,
            A.shape[0],
            self.s.shape[1],
            A.shape[1],
        )
        C = C.reshape(A.shape[:-1] + (self.s.shape[1],))

        if self.bias is not None:
            C += self.bias

        return C


GPTQ_MARLIN_24_MIN_THREAD_N = 128
GPTQ_MARLIN_24_MIN_THREAD_K = 128
GPTQ_MARLIN_24_MAX_PARALLEL = 64
GPTQ_MARLIN_24_SUPPORTED_NUM_BITS = [4, 8]
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128]
MARLIN_TILE_SIZE = 16


@dataclass
class GPTQMarlin24Weight:
    """
    GPTQ-Marlin 2:4 weights.

    Attributes:
        B (torch.Tensor): int4-quantized weights packed into int32.
        B_meta (torch.Tensor): metadata for 2:4 sparsity.
        s (torch.Tensor): float16 scales.
        bits: quantized weight size.
    """

    weight_packed: torch.Tensor
    meta: torch.Tensor
    scale_packed: torch.Tensor
    bits: int

    def __post_init__(self):
        assert self.weight_packed.dtype == torch.int32
        assert self.meta.dtype == torch.int16
        assert self.scale_packed.dtype == torch.float16

    def get_linear(self, bias: torch.Tensor):
        return GPTQMarlin24Linear(
            weight=self,
            bias=bias,
        )


class GPTQMarlin24Linear(nn.Module):
    def __init__(self, *, weight: GPTQMarlin24Weight, bias: Optional[torch.Tensor]):
        super().__init__()

        _check_marlin_kernels()
        assert quantization is not None

        if weight.bits not in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS:
            supported_bits = ", ".join(
                str(b) for b in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
            )
            raise RuntimeError(
                f"{weight.bits}-bit GPTQ Sparse 2:4 Marlin is not supported, must be one of: {supported_bits}"
            )

        in_features = weight.weight_packed.shape[0] * MARLIN_TILE_SIZE * 2
        out_features = weight.scale_packed.shape[1]
        groupsize = (
            -1
            if weight.scale_packed.shape[0] == 1
            else in_features // weight.scale_packed.shape[0]
        )

        if groupsize not in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES:
            supported_sizes = ", ".join(
                str(b) for b in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
            )
            raise RuntimeError(
                f"Group size {groupsize} is not supported, must be one of: {supported_sizes}"
            )

        if weight.bits == 4:
            self.quant_type = quantization.scalar_types.uint4b8
        else:
            self.quant_type = quantization.scalar_types.uint8b128
        weights_per_int32 = 32 // weight.bits

        assert (
            out_features % GPTQ_MARLIN_24_MIN_THREAD_N == 0
        ), f"Number of output features ({out_features}) not divisable by {GPTQ_MARLIN_24_MIN_THREAD_N} threads"
        assert (
            out_features % weights_per_int32 == 0
        ), f"Number of output features ({out_features}) not divisable by weights per int32 ({weights_per_int32})"

        assert (
            in_features % GPTQ_MARLIN_24_MIN_THREAD_K == 0
        ), f"Number of output features ({out_features}) not divisable by {GPTQ_MARLIN_24_MIN_THREAD_K} threads"
        if groupsize != -1 and in_features % groupsize != 0:
            raise ValueError(
                f"Number of input features ({in_features}) not divisable by group size ({groupsize})"
            )

        self.weight_packed = weight.weight_packed
        self.meta = weight.meta
        self.scale_packed = weight.scale_packed
        if bias is not None:
            self.bias = bias
        else:
            self.bias = None

        self.workspace = torch.zeros(
            (out_features // GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL,
            dtype=torch.int,
            device=weight.weight_packed.device,
        )

    def forward(self, A: torch.Tensor) -> torch.Tensor:
        assert quantization is not None

        C = quantization.gptq_marlin_24_gemm(
            A.view(-1, A.shape[-1]),
            self.weight_packed,
            self.meta,
            self.scale_packed,
            self.workspace,
            self.quant_type,
            A.shape[0],
            self.scale_packed.shape[1],
            A.shape[1],
        )

        C = C.reshape(A.shape[:-1] + (self.scale_packed.shape[1],))

        if self.bias is not None:
            C += self.bias

        return C


================================================
FILE: server/text_generation_server/layers/marlin/util.py
================================================
import functools
from typing import List, Tuple

import numpy
import torch
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel

if SYSTEM == "cuda":
    quantization = load_kernel(
        module="quantization", repo_id="kernels-community/quantization"
    )
else:
    quantization = None

try:
    major, _minor = torch.cuda.get_device_capability()
    has_sm_8_0 = major >= 8
except Exception:
    has_sm_8_0 = False


def _check_marlin_kernels():
    if not (SYSTEM == "cuda" and has_sm_8_0):
        raise NotImplementedError(
            "Using quantized Marlin models requires a GPU with CUDA capability 8.0 or later."
        )

    if quantization is None:
        raise NotImplementedError(
            "marlin is not installed, install it with: pip install server/marlin"
        )


# https://github.com/IST-DASLab/marlin/blob/2f6d7c10e124b3c5fa29ff8d77d568bd7af3274c/marlin/__init__.py#L40C1-L68C54
@functools.cache
def get_perms() -> Tuple[List[int], List[int]]:
    scale_perm = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    scale_perm_single = []
    for i in range(4):
        scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return scale_perm, scale_perm_single


def permute_scales(scales: torch.Tensor):
    scale_perm, scale_perm_single = get_perms()
    out_features = scales.shape[1]
    if scales.shape[0] == 1:
        scales = scales.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
    else:
        scales = scales.reshape((-1, len(scale_perm)))[:, scale_perm]
    return scales.reshape((-1, out_features)).contiguous()


# Functions below are from vLLM


def get_pack_factor(bits: int) -> int:
    if 32 % bits != 0:
        raise ValueError(f"Cannot {bits} bit values into uint32")
    return 32 // bits


def pack_cols(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    assert q_w.shape == (size_k, size_n)

    pack_factor = get_pack_factor(num_bits)
    assert size_n % pack_factor == 0

    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)

    q_res = numpy.zeros((size_k, size_n // pack_factor), dtype=numpy.uint32)

    for i in range(pack_factor):
        q_res |= q_w[:, i::pack_factor] << num_bits * i

    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
    q_res = q_res.contiguous()

    return q_res


def unpack_cols(
    packed_q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    pack_factor = get_pack_factor(num_bits)
    assert size_n % pack_factor == 0
    assert packed_q_w.shape == (
        size_k,
        size_n // pack_factor,
    ), "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format(
        packed_q_w.shape, size_k, size_n, pack_factor
    )

    orig_device = packed_q_w.device

    packed_q_w_cpu = packed_q_w.cpu().numpy().astype(numpy.uint32)
    q_res = numpy.zeros((size_k, size_n), dtype=numpy.uint32)

    mask = (1 << num_bits) - 1
    for i in range(pack_factor):
        vals = packed_q_w_cpu & mask
        packed_q_w_cpu >>= num_bits
        q_res[:, i::pack_factor] = vals

    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
    q_res = q_res.contiguous()

    return q_res


def marlin_zero_points(
    zp: torch.Tensor, size_k: int, size_n: int, num_bits: int
) -> torch.Tensor:
    scale_perm, _ = get_perms()
    # Permute zero-points in a similar way to scales, but do not use the
    # "single" permutation, since zero-points are applied on every MMA
    zp = zp.reshape((-1, len(scale_perm)))[:, scale_perm]

    # Interleave column dim (for the dequantize code) and pack it to int32
    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = numpy.array([0, 2, 1, 3])
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    zp = zp.reshape((-1, len(interleave)))[:, interleave].ravel()
    zp = zp.reshape((-1, size_n)).contiguous()
    zp = pack_cols(zp, num_bits, size_k, size_n)

    return zp


================================================
FILE: server/text_generation_server/layers/medusa.py
================================================
import torch
from torch import nn
from typing import Tuple, Optional
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.layers.linear import FastLinear
from text_generation_server.layers.tensor_parallel import (
    TensorParallelHead,
    TensorParallelColumnLinear,
)


class ResBlock(torch.nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.linear = FastLinear.load(
            config, prefix=f"{prefix}.linear", weights=weights, bias=True
        )
        self.act = torch.nn.SiLU()

    def forward(self, x):
        return x + self.act(self.linear(x))


class MedusaModel(torch.nn.Module):
    def __init__(self, config, medusa_config, weights):
        super().__init__()
        self.heads = torch.nn.ModuleList(
            [
                MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights)
                for i in range(get_speculate())
            ]
        )

    def forward(self, x):
        if not self.heads:
            return None
        speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
        return speculative_logits


class MedusaHead(torch.nn.Module):
    def __init__(self, config, medusa_config, prefix, weights):
        super().__init__()
        self.blocks = torch.nn.ModuleList(
            [
                ResBlock(config, prefix=f"{prefix}.{i}", weights=weights)
                for i in range(medusa_config["medusa_num_layers"])
            ]
        )
        n = len(self.blocks)
        self.out = FastLinear.load(
            config, prefix=f"{prefix}.{n}", weights=weights, bias=False
        )

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        x = self.out(x)
        return x


class MedusaHeadV1(nn.Module):
    def __init__(self, lm_head, medusa):
        super().__init__()
        self.lm_head = lm_head
        self.medusa = medusa

    @staticmethod
    def load(config, prefix: str, weights):
        from pathlib import Path
        from safetensors import safe_open
        import json

        speculator = config.speculator

        path = speculator["path"]
        medusa_config = str(Path(path) / "config.json")

        for fname in speculator["model_paths"]:
            filename = str(Path(path) / fname)

            with open(medusa_config, "r") as f:
                medusa_config = json.load(f)
            routing = weights.routing
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing and routing[k] != filename:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename

        medusa = MedusaModel(config, medusa_config, weights)
        lm_head = TensorParallelHead.load(config, prefix, weights)
        return MedusaHeadV1(lm_head, medusa)

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(input)
        # If we have too many tokens, we skip speculative logits
        if input.shape[0] > 128:
            return logits, None

        speculative_logits = self.medusa(input)
        return logits, speculative_logits


class MedusaHeadV2(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        from pathlib import Path
        from safetensors import safe_open
        import json

        speculator_path = config.speculator["path"]

        medusa_config = str(Path(speculator_path) / "config.json")
        filename = str(Path(speculator_path) / "medusa_lm_head.safetensors")

        with open(medusa_config, "r") as f:
            medusa_config = json.load(f)
        routing = weights.routing
        with safe_open(filename, framework="pytorch") as f:
            for k in f.keys():
                if k in routing and routing[k] != filename:
                    raise RuntimeError(
                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                    )
                routing[k] = filename

        self.n_medusa_heads = get_speculate()

        assert medusa_config["medusa_num_layers"] == 1
        self.linear = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.process_group = weights.process_group
        self.world_size = self.process_group.size()
        self.rank = self.process_group.rank()

        self.act = torch.nn.SiLU()

        self.lm_head = TensorParallelHead.load(config, prefix, weights)

    def forward(self, x):
        # If we have too many tokens, we skip speculative logits
        if x.shape[0] > 128:
            logits = self.lm_head(x)
            return logits, None

        size = x.shape[-1]
        block_size = (size + self.world_size - 1) // self.world_size
        start = self.rank * block_size
        stop = (self.rank + 1) * block_size

        x_block = x[:, start:stop]

        # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1
        medusa_res = self.act(self.linear(x)).reshape(
            *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1]
        )

        # Apply all residual medusa heads
        output = x[:, start:stop].unsqueeze(-2) + medusa_res

        # Gather medusa heads
        world_output = [
            torch.empty_like(output) for _ in range(self.process_group.size())
        ]
        torch.distributed.all_gather(world_output, output, group=self.process_group)
        world_output = torch.cat(world_output, dim=-1)

        # Stack x and medusa residual x
        stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2)

        # Compute lm head on x + medusa residual x
        logits = self.lm_head(stacked_x)

        # Finally, split logits from speculative logits
        logits, speculative_logits = torch.split(
            logits, [1, self.n_medusa_heads], dim=-2
        )
        # Squeeze added dimension
        logits = logits.squeeze(-2)

        return logits, speculative_logits


================================================
FILE: server/text_generation_server/layers/mlp.py
================================================
import torch
import math
from torch import nn
from torch.nn import functional as F
from typing import Optional, Tuple
from text_generation_server.layers import TensorParallelEmbedding, FastLinear
from text_generation_server.layers.tensor_parallel import TensorParallelHead
from text_generation_server.utils.speculate import get_speculate


class MLPSpeculatorLayerNorm(nn.Module):
    """
    A L2 normalization implementation
    ...
    Args
    ----
    normalized_shape : int
        Dimensionality of input data (size of final tensor axis)
    elementwise_scale_weight : torch.Tensor
        learned scaling term after normalization?
    elementwise_shift_bias : torch.Tensor
        learned bias term after normalization?
    eps : float
        Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8).
    """

    def __init__(
        self,
        prefix,
        config,
        weights,
        eps=1e-06,
    ):
        super(MLPSpeculatorLayerNorm, self).__init__()
        self.weight = weights.get_tensor(f"{prefix}.weight")
        self.bias = weights.get_tensor(f"{prefix}.bias")
        self.eps = eps

    def forward(self, x):
        xf = x
        xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps)
        x = xf.type_as(x)
        x = self.weight * x
        x = x + self.bias
        return x


INV_SQRT2 = 2**-0.5


def simple_norm(x: torch.Tensor, eps=1e-06):
    xf = x
    xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + eps)
    x = xf.type_as(x)
    return x * INV_SQRT2


class MLPSpeculatorModelTied(torch.nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.config = config
        self.n_predict = get_speculate()
        self.hidden_size = config.hidden_size

        self.emb = TensorParallelEmbedding(f"{prefix}.emb.0", weights)
        self.proj0 = FastLinear.load(
            config,
            prefix=f"{prefix}.proj.0",
            weights=weights,
            bias=False,
        )
        self.proj1 = FastLinear.load(
            config,
            prefix=f"{prefix}.proj.1",
            weights=weights,
            bias=False,
        )
        self.head = FastLinear.load(config, f"{prefix}.head.0", weights, bias=False)
        self.ln = MLPSpeculatorLayerNorm(
            prefix=f"{prefix}.ln.0",
            config=config,
            weights=weights,
        )

        # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation
        self.state_weight = 0.5 ** (0.5 / self.n_predict) if self.n_predict > 0 else 1
        self.activation = nn.GELU()
        self.vsize = config.vocab_size
        self.inner_dim = config.speculator_config["inner_dim"]
        self.top_k_tokens_per_head = [1] * self.n_predict
        self.emb_weight = math.sqrt(1 - self.state_weight**2) * math.sqrt(
            self.inner_dim / 2
        )
        self.emb.weight *= self.emb_weight

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_ids: torch.Tensor,
    ):
        top_k_tokens_per_head = self.top_k_tokens_per_head

        # k indicates # of candidates
        # h indicates # of generated tokens
        state = hidden_states
        b = state.size(0)
        ind = input_ids.unsqueeze(0)
        all_probs = torch.empty(
            b, self.n_predict, self.vsize, device=state.device
        )  # b k h v
        assert (
            len(top_k_tokens_per_head) == self.n_predict
        ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)"
        for i in range(self.n_predict):
            # Project and predict
            z = self.emb(ind)
            # z = z.mul(self.emb_weight)  # b k d
            if i == 0:
                state = self.proj0(state) * self.state_weight + z
            else:
                state = self.proj1(state) * self.state_weight + z
            state = self.activation(self.ln(state))  # b k d
            probs = F.log_softmax(self.head(state), dim=-1)  # b k v
            _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1)  # b k k'

            # Update candidate set with new predictions

            # Update distribution set with new logits
            all_probs[:, i] = probs.exp()

            # Update state, log_probs and ind for new predictions
            state = state.unsqueeze(2).expand(
                -1, -1, top_k_tokens_per_head[i], -1
            )  # b k k' d
            state = state.reshape(-1, b, state.size(3))  # b kk' d
            ind = preds.view(-1, b)  # b kk'

        speculative_logits = all_probs
        return speculative_logits


class MLPSpeculatorModel(torch.nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.config = config
        self.n_predict = get_speculate()
        self.hidden_size = config.hidden_size

        self.emb = nn.ModuleList(
            [
                TensorParallelEmbedding(f"{prefix}.emb.{i}", weights)
                for i in range(self.n_predict)
            ]
        )
        self.proj = [
            FastLinear.load(
                config,
                prefix=f"{prefix}.proj.{i}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_predict)
        ]
        self.head = nn.ModuleList(
            [
                FastLinear.load(config, f"{prefix}.head.{i}", weights, bias=False)
                for i in range(self.n_predict)
            ]
        )
        self.ln = nn.ModuleList(
            [
                MLPSpeculatorLayerNorm(
                    prefix=f"{prefix}.ln.{i}",
                    config=config,
                    weights=weights,
                )
                for i in range(self.n_predict)
            ]
        )

        # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation
        self.state_weight = 0.5 ** (0.5 / self.n_predict) if self.n_predict > 0 else 1
        self.activation = nn.GELU()
        self.vsize = config.vocab_size
        self.inner_dim = config.speculator_config["inner_dim"]
        self.top_k_tokens_per_head = [1] * self.n_predict
        self.emb_weight = math.sqrt(1 - self.state_weight**2) * math.sqrt(
            self.inner_dim / 2
        )
        self.emb.weight *= self.emb_weight

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_ids: torch.Tensor,
    ):
        top_k_tokens_per_head = self.top_k_tokens_per_head

        # k indicates # of candidates
        # h indicates # of generated tokens
        state = hidden_states
        b = state.size(0)
        ind = input_ids.unsqueeze(0)
        all_probs = torch.empty(
            b, self.n_predict, self.vsize, device=state.device
        )  # b k h v
        assert (
            len(top_k_tokens_per_head) == self.n_predict
        ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)"
        for i in range(self.n_predict):
            # Project and predict
            z = self.emb[i](ind)
            # z = z.mul(self.emb_weight)  # b k d
            state = self.proj[i](state) * self.state_weight + z
            state = self.activation(self.ln[i](state))  # b k d
            probs = F.log_softmax(self.head[i](state), dim=-1)  # b k v
            _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1)  # b k k'

            # Update candidate set with new predictions

            # Update distribution set with new logits
            all_probs[:, i] = probs.exp()

            # Update state, log_probs and ind for new predictions
            state = state.unsqueeze(2).expand(
                -1, -1, top_k_tokens_per_head[i], -1
            )  # b k k' d
            state = state.reshape(-1, b, state.size(3))  # b kk' d
            ind = preds.view(-1, b)  # b kk'

        speculative_logits = all_probs
        return speculative_logits


class MLPSpeculatorHead(nn.Module):
    def __init__(self, lm_head, mlp_speculator, scale_input: bool):
        super().__init__()
        self.lm_head = lm_head
        self.mlp_speculator = mlp_speculator
        self.scale_input = scale_input

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(input)
        # If we have too many tokens, we skip speculative logits
        if input.shape[0] > 128:
            return logits, None

        input_ids = logits.argmax(dim=-1)
        if self.scale_input:
            input = simple_norm(input)
        speculative_logits = self.mlp_speculator(input, input_ids)
        return logits, speculative_logits

    @staticmethod
    def load(config, prefix: str, weights):
        from pathlib import Path
        from safetensors import safe_open

        speculator_path = config.speculator["path"]

        for fname in config.speculator["model_paths"]:
            filename = str(Path(speculator_path) / fname)
            routing = weights.routing
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing and routing[k] != filename:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename

        tie_weights = config.speculator_config.get("tie_weights", False)
        if tie_weights:
            mlp_speculator = MLPSpeculatorModelTied(config, "speculator", weights)
        else:
            mlp_speculator = MLPSpeculatorModel(config, "speculator", weights)
        # This is used in https://huggingface.co/ibm-fms/llama3-70b-accelerator
        scale_input = config.speculator_config.get("scale_input", False)
        lm_head = TensorParallelHead.load(config, prefix, weights)
        return MLPSpeculatorHead(lm_head, mlp_speculator, scale_input)


================================================
FILE: server/text_generation_server/layers/moe/__init__.py
================================================
from typing import Optional, Protocol, runtime_checkable

import torch
import torch.nn as nn
from loguru import logger
from transformers.activations import ACT2FN

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)
from text_generation_server.layers.fp8 import HybridFP8UnquantLoader
from text_generation_server.layers.marlin import GPTQMarlinWeightsLoader
from text_generation_server.layers.moe.gptq_marlin import (
    GPTQMarlinSparseMoELayer,
    can_use_marlin_moe_gemm,
)
from text_generation_server.layers.moe.unquantized import UnquantizedSparseMoELayer
from text_generation_server.layers.moe.fp8 import FP8SparseMoELayer
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    Weights,
    UnquantizedWeight,
)

if SYSTEM == "ipex":
    from .fused_moe_ipex import fused_topk, grouped_topk
elif SYSTEM == "cuda":
    moe_kernels = load_kernel(module="moe", repo_id="kernels-community/moe")
    fused_topk = moe_kernels.fused_topk
    grouped_topk = moe_kernels.grouped_topk
else:
    from moe_kernels.fused_moe import fused_topk, grouped_topk


# NOTE: we are using a protocol here, because multiple inherance is not nice.
#       We need `Module`, and `Module` -> some abstract class -> some concrete
#       class inheritance is whacky.


@runtime_checkable
class MoELayer(Protocol):
    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
        hidden_act: str = "silu",
        scoring_func: Optional[str] = None,
        e_score_correction_bias: Optional[float] = None,
    ): ...

    def forward(
        self, x: torch.Tensor, *, gating_output: torch.Tensor
    ) -> torch.Tensor: ...


class DenseMoELayer(nn.Module):
    """
    Layer for MoE that applies *all* experts to each tokens and then weights
    their outputs based on the calculated routing. This layer is much slower
    than `SparseMoELayer` and should only be used when no fused kernels are
    available (e.g. for unsupported quantizers).
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
        hidden_act: str = "silu",
        scoring_func: Optional[str] = None,
        e_score_correction_bias: Optional[float] = None,
    ):
        super().__init__()

        assert scoring_func is None, "scoring func is not handled"
        assert e_score_correction_bias is None, "scoring correction bias is not handled"

        log_once(
            logger.info,
            "No fused layers are available for this model type, using (slower) dense MoE layer",
        )

        assert (n_expert_group is None) == (
            topk_group is None
        ), "n_expert_group and topk_group must both be None or have some value"

        self.n_expert_group = n_expert_group
        self.n_experts = n_experts
        self.renormalize = renormalize
        self.topk = topk
        self.topk_group = topk_group

        if "gelu" in hidden_act:
            self.act = lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh"
                    if hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
                    else "none"
                ),
            )
        elif "silu" in hidden_act:
            self.act = torch.nn.functional.silu
        else:
            self.act = ACT2FN[hidden_act]

        self.gate_proj = [
            TensorParallelColumnLinear.load(
                None,
                prefix=f"{prefix}.{i}.{gate_proj_name}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_experts)
        ]
        self.up_proj = [
            TensorParallelColumnLinear.load(
                None,
                prefix=f"{prefix}.{i}.{up_proj_name}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_experts)
        ]
        self.down_proj = [
            TensorParallelRowLinear.load(
                None,
                prefix=f"{prefix}.{i}.{down_proj_name}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_experts)
        ]

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        """
        x: (sequence_length, model_dim)
        gating_output: (sequence_length, n_experts)
        """
        # optional reshape
        input_shape = x.shape
        x = x.view(-1, input_shape[-1])

        if self.n_expert_group is not None and self.topk_group is not None:
            topk_weights, topk_ids = grouped_topk(
                x,
                gating_output,
                self.topk,
                renormalize=self.renormalize,
                num_expert_group=self.n_expert_group,
                topk_group=self.topk_group,
            )
        else:
            topk_weights, topk_ids = fused_topk(
                x, gating_output, self.topk, self.renormalize
            )
            topk_weights = topk_weights.to(x.dtype)

        weights = torch.zeros(
            topk_ids.shape[0], self.n_experts, dtype=x.dtype, device=x.device
        )

        weights.scatter_(1, topk_ids.long(), topk_weights.to(weights.dtype))

        out = torch.zeros_like(x)
        for i in range(self.n_experts):
            h = self.act(self.gate_proj[i](x)) * self.up_proj[i](x)
            h = self.down_proj[i](h, reduce=False)
            out += h * weights[:, i].view(-1, 1)

        return out


class SparseMoELayer(nn.Module):
    """
    Layer for MoE that uses fused kernels to only apply the active experts
    for each token (rather than applying all experts and selecting the
    outputs of active experts).
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        scoring_func: Optional[str] = "softmax",
        e_score_correction_bias: Optional[float] = None,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        super().__init__()
        if (
            isinstance(weights.loader, DefaultWeightsLoader)
            and isinstance(weights.loader.weight_class, UnquantizedWeight)
        ) or isinstance(weights.loader, HybridFP8UnquantLoader):
            if (
                isinstance(weights.loader, HybridFP8UnquantLoader)
                and weights.loader.to_fp8
            ):
                cls = FP8SparseMoELayer
            else:
                cls = UnquantizedSparseMoELayer
        elif isinstance(
            weights.loader, GPTQMarlinWeightsLoader
        ) and can_use_marlin_moe_gemm(
            quant_method=weights.loader.quant_method,
            quantize=weights.loader.quantize,
            sym=weights.loader.sym,
        ):
            cls = GPTQMarlinSparseMoELayer
        else:
            raise ValueError(
                f"Unsupported weights loader: {type(weights.loader)}, sparse MoE is only supported for unquantized, AWQ, and GPTQ weights"
            )

        log_once(
            logger.info,
            "Using MoE layer wih fused gemm",
        )

        self.moe = cls(
            n_expert_group=n_expert_group,
            n_experts=n_experts,
            prefix=prefix,
            renormalize=renormalize,
            topk=topk,
            topk_group=topk_group,
            weights=weights,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
            down_proj_name=down_proj_name,
        )

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        return self.moe(x, gating_output=gating_output)

    @staticmethod
    def is_supported(weights: Weights) -> bool:
        return (
            (
                isinstance(weights.loader, DefaultWeightsLoader)
                and isinstance(weights.loader.weight_class, UnquantizedWeight)
            )
            or isinstance(weights.loader, HybridFP8UnquantLoader)
            or (
                isinstance(weights.loader, GPTQMarlinWeightsLoader)
                and can_use_marlin_moe_gemm(
                    quant_method=weights.loader.quant_method,
                    quantize=weights.loader.quantize,
                    sym=weights.loader.sym,
                )
            )
        )


================================================
FILE: server/text_generation_server/layers/moe/fp8.py
================================================
from typing import Optional

import torch
import torch.nn as nn

from text_generation_server.utils.weights import Weights
from text_generation_server.layers.fp8 import (
    Fp8Weight,
    fp8_quantize,
    quant_dtype,
    normalize_e4m3fn_to_native_float8,
)

try:
    from .unquantized import fused_moe
except Exception:
    fused_moe = None


class FP8SparseMoELayer(nn.Module):
    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        scoring_func: Optional[str] = "softmax",
        e_score_correction_bias: Optional[float] = None,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        super().__init__()

        assert (n_expert_group is None) == (
            topk_group is None
        ), "n_expert_group and topk_group must both be None or have some value"

        self.n_expert_group = n_expert_group
        self.topk = topk
        self.topk_group = topk_group
        self.renormalize = renormalize
        self.weight_block_size = weights.weights_loader.weight_block_size
        self.scoring_func = scoring_func
        self.e_score_correction_bias = e_score_correction_bias

        (
            self.gate_up_proj,
            self.gate_up_proj_weight_scale,
            self.gate_up_proj_input_scale,
        ) = _load_expert_multi_weights_col(
            prefix=prefix,
            n_experts=n_experts,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
            weights=weights,
        )

        self.down_proj, self.down_proj_weight_scale, self.down_proj_input_scale = (
            _load_expert_weights_row(
                prefix=prefix,
                n_experts=n_experts,
                name=down_proj_name,
                weights=weights,
            )
        )

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        return fused_moe(
            x,
            w1=self.gate_up_proj,
            w2=self.down_proj,
            gating_output=gating_output,
            topk=self.topk,
            renormalize=self.renormalize,
            inplace=True,
            use_grouped_topk=self.n_expert_group is not None,
            num_expert_group=self.n_expert_group,
            topk_group=self.topk_group,
            scoring_func=self.scoring_func,
            e_score_correction_bias=self.e_score_correction_bias,
            use_fp8_w8a8=True,
            w1_scale=self.gate_up_proj_weight_scale,
            w2_scale=self.down_proj_weight_scale,
            a1_scale=self.gate_up_proj_input_scale,
            a2_scale=self.down_proj_input_scale,
        )


def _load_expert_weights(
    get_weight_fn,
    *,
    prefix: str,
    n_experts: int,
    name: str,
    weights: Weights,
) -> torch.Tensor:
    all_weight = None
    all_weight_scales = None
    max_input_scale = None

    for i in range(n_experts):
        weight = get_weight_fn(prefix, i, name, weights)

        assert isinstance(weight, Fp8Weight)

        if all_weight is None:
            all_weight = torch.empty(
                (n_experts,) + weight.weight.shape,
                dtype=quant_dtype,
                device=weight.weight.device,
            )
        if all_weight_scales is None:
            all_weight_scales = torch.empty(
                (n_experts,) + weight.weight_scale.shape,
                dtype=torch.float32,
                device=weight.weight.device,
            )

        if weight.weight.dtype in {torch.float8_e4m3fn, torch.float8_e4m3fnuz}:
            all_weight[i], all_weight_scales[i], current_input_scale = (
                normalize_e4m3fn_to_native_float8(
                    weight.weight, weight.weight_scale, weight.input_scale
                )
            )
            if current_input_scale is not None:
                if max_input_scale is None or current_input_scale > max_input_scale:
                    max_input_scale = current_input_scale
        else:
            all_weight[i], all_weight_scales[i] = fp8_quantize(
                weight.weight, scalar=True
            )

    assert all_weight is not None

    return all_weight, all_weight_scales, max_input_scale


def _load_expert_multi_weights_col(
    *,
    prefix: str,
    n_experts: int,
    gate_proj_name: str,
    up_proj_name: str,
    weights: Weights,
) -> torch.Tensor:
    def get_weight_fn(prefix, i, name, weights):
        return weights.get_multi_weights_col(
            [f"{prefix}.{i}.{gate_proj_name}", f"{prefix}.{i}.{up_proj_name}"], 0
        )

    return _load_expert_weights(
        get_weight_fn, prefix=prefix, n_experts=n_experts, name=None, weights=weights
    )


def _load_expert_weights_row(
    *,
    prefix: str,
    n_experts: int,
    name: str,
    weights: Weights,
) -> torch.Tensor:
    def get_weight_fn(prefix, i, name, weights):
        return weights.get_weights_row(f"{prefix}.{i}.{name}")

    return _load_expert_weights(
        get_weight_fn, prefix=prefix, n_experts=n_experts, name=name, weights=weights
    )


================================================
FILE: server/text_generation_server/layers/moe/fused_moe_ipex.py
================================================
# coding=utf-8
# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple

import torch


def grouped_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
    num_expert_group: int = 0,
    topk_group: int = 0,
) -> Tuple[torch.Tensor, torch.Tensor]:
    scores = torch.softmax(gating_output, dim=-1)
    num_token = scores.shape[0]
    group_scores = (
        scores.view(num_token, num_expert_group, -1).max(dim=-1).values
    )  # [n, n_group]
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
        1
    ]  # [n, top_k_group]
    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
    score_mask = (
        group_mask.unsqueeze(-1)
        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
        .reshape(num_token, -1)
    )  # [n, e]
    tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
    topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)

    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

    return topk_weights, topk_ids


def fused_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
) -> Tuple[torch.Tensor, torch.Tensor]:
    topk_weights = torch.nn.functional.softmax(
        gating_output, dim=1, dtype=torch.float32
    )
    topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
    if renormalize:
        topk_weights /= topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids


================================================
FILE: server/text_generation_server/layers/moe/gptq_marlin.py
================================================
from dataclasses import dataclass
from typing import Callable, List, Optional

import torch
import torch.nn as nn

from text_generation_server.layers import moe
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.weights import Weights
from text_generation_server.layers.marlin.gptq import (
    GPTQMarlinWeight,
    GPTQMarlinWeightsLoader,
)

if SYSTEM == "cuda":
    moe_kernels = load_kernel(module="moe", repo_id="kernels-community/moe")
else:
    moe_kernels = None


try:
    major, _minor = torch.cuda.get_device_capability()
    has_sm_8_0 = major >= 8
except Exception:
    has_sm_8_0 = False


def can_use_marlin_moe_gemm(
    *,
    quant_method: str,
    quantize: str,
    sym: bool,
):
    return (
        SYSTEM == "cuda"
        and moe is not None
        and has_sm_8_0
        and quantize in {"awq", "gptq"}
        and quant_method in {"awq", "gptq"}
        # We only support asymmetric quantization for AWQ.
        and (sym or quant_method == "awq")
    )


@dataclass
class GPTQMarlinMoEWeight:
    qweight: torch.Tensor
    qzeros: torch.Tensor
    scales: torch.Tensor
    g_idx: torch.Tensor
    perm: torch.Tensor
    is_full_k: bool


class GPTQMarlinSparseMoELayer(nn.Module):
    """
    MoE layer that uses a fused GPTQ-Marlin kernel.
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
        scoring_func: Optional[str] = None,
        e_score_correction_bias: Optional[float] = None,
    ):
        assert scoring_func in (
            "sigmoid",
            "softmax",
        ), f"scoring func {scoring_func} is not handled"
        super().__init__()

        if not (
            isinstance(weights.loader, GPTQMarlinWeightsLoader)
            and can_use_marlin_moe_gemm(
                quant_method=weights.loader.quant_method,
                quantize=weights.loader.quantize,
                sym=weights.loader.sym,
            )
        ):
            raise ValueError(
                f"Unsupported weights loader: {type(weights.loader)}, only GPTQMarlinWeightsLoader with AWQ and symmetric GPTQ quantization is supported"
            )

        assert (n_expert_group is None) == (
            topk_group is None
        ), "n_expert_group and topk_group must both be None or have some value"

        self.n_expert_group = n_expert_group
        self.topk = topk
        self.topk_group = topk_group
        self.renormalize = renormalize
        self.scoring_func = scoring_func
        self.e_score_correction_bias = e_score_correction_bias

        self.gate_up_proj = _load_expert_multi_weights_col(
            prefix=prefix,
            n_experts=n_experts,
            names=[gate_proj_name, up_proj_name],
            weights=weights,
        )

        self.down_proj = _load_expert_weights_row(
            prefix=prefix, n_experts=n_experts, name=down_proj_name, weights=weights
        )

        self.bits = weights.loader.bits

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        return fused_marlin_moe(
            hidden_states=x,
            w1=self.gate_up_proj.qweight,
            w2=self.down_proj.qweight,
            w1_scale=self.gate_up_proj.scales,
            w2_scale=self.down_proj.scales,
            w1_zeros=(
                self.gate_up_proj.qzeros
                if self.gate_up_proj.qzeros.numel() > 0
                else None
            ),
            w2_zeros=(
                self.down_proj.qzeros if self.down_proj.qzeros.numel() > 0 else None
            ),
            g_idx1=self.gate_up_proj.g_idx,
            g_idx2=self.down_proj.g_idx,
            sort_indices1=self.gate_up_proj.perm,
            sort_indices2=self.down_proj.perm,
            is_k_full=self.gate_up_proj.is_full_k or self.down_proj.is_full_k,
            gating_output=gating_output,
            topk=self.topk,
            renormalize=self.renormalize,
            use_grouped_topk=self.n_expert_group is not None,
            num_expert_group=self.n_expert_group,
            topk_group=self.topk_group,
            num_bits=self.bits,
            scoring_func=self.scoring_func,
            e_score_correction_bias=self.e_score_correction_bias,
        )


def _load_expert_multi_weights_col(
    *,
    prefix: str,
    n_experts: int,
    names: List[str],
    weights: Weights,
) -> GPTQMarlinMoEWeight:
    moe_weight = None
    for i in range(n_experts):
        weight = weights.get_multi_weights_col(
            [f"{prefix}.{i}.{name}" for name in names], 0
        )
        assert isinstance(weight, GPTQMarlinWeight)
        moe_weight = _pack_weight(
            n_experts=n_experts, expert=i, weight=weight, moe_weight=moe_weight
        )
    assert moe_weight is not None
    return moe_weight


def _load_expert_weights_row(
    *,
    prefix: str,
    n_experts: int,
    name: str,
    weights: Weights,
) -> GPTQMarlinMoEWeight:
    moe_weight = None
    for i in range(n_experts):
        weight = weights.get_weights_row(
            f"{prefix}.{i}.{name}",
        )
        assert isinstance(weight, GPTQMarlinWeight)
        moe_weight = _pack_weight(
            n_experts=n_experts, expert=i, weight=weight, moe_weight=moe_weight
        )
    assert moe_weight is not None
    return moe_weight


def _pack_weight(
    *,
    n_experts: int,
    expert: int,
    moe_weight: Optional[GPTQMarlinMoEWeight],
    weight: GPTQMarlinWeight,
) -> GPTQMarlinMoEWeight:
    if moe_weight is None:
        qweight = torch.empty(
            (n_experts,) + weight.qweight.shape,
            dtype=weight.qweight.dtype,
            device=weight.qweight.device,
        )
        qzeros = torch.empty(
            (n_experts,) + weight.qzeros.shape,
            dtype=weight.qzeros.dtype,
            device=weight.qzeros.device,
        )
        scales = torch.empty(
            (n_experts,) + weight.scales.shape,
            dtype=weight.scales.dtype,
            device=weight.scales.device,
        )
        g_idx = torch.empty(
            (n_experts,) + weight.g_idx.shape,
            dtype=weight.g_idx.dtype,
            device=weight.g_idx.device,
        )
        perm = torch.empty(
            (n_experts,) + weight.perm.shape,
            dtype=weight.perm.dtype,
            device=weight.perm.device,
        )

        moe_weight = GPTQMarlinMoEWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            perm=perm,
            is_full_k=weight.is_full_k,
        )

    moe_weight.qweight[expert] = weight.qweight
    moe_weight.qzeros[expert] = weight.qzeros
    moe_weight.scales[expert] = weight.scales
    moe_weight.g_idx[expert] = weight.g_idx
    moe_weight.perm[expert] = weight.perm

    return moe_weight


def fused_marlin_moe(
    *,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    gating_output: torch.Tensor,
    g_idx1: torch.Tensor,
    g_idx2: torch.Tensor,
    sort_indices1: torch.Tensor,
    sort_indices2: torch.Tensor,
    w1_zeros: Optional[torch.Tensor] = None,
    w2_zeros: Optional[torch.Tensor] = None,
    is_k_full: bool,
    topk: int,
    renormalize: bool,
    num_bits: int = 8,
    use_grouped_topk: bool = False,
    num_expert_group: Optional[int] = None,
    custom_routing_function: Optional[Callable] = None,
    topk_group: Optional[int] = None,
    scoring_func: Optional[str] = None,
    e_score_correction_bias: Optional[float] = None,
) -> torch.Tensor:
    """
    This function computes a Mixture of Experts (MoE) layer using two sets of
    weights, w1 and w2, and top-k gating mechanism.

    Parameters:
    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
    - w1 (torch.Tensor): The first set of expert weights.
    - w2 (torch.Tensor): The second set of expert weights.
    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
        w1.
    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
        w2.
    - gating_output (torch.Tensor): The output of the gating operation
        (before softmax).
    - g_idx1 (torch.Tensor): The first set of act_order indices.
    - g_idx2 (torch.Tensor): The second set of act_order indices.
    - sort_indices1 (torch.Tensor): The first act_order input permutation.
    - sort_indices2 (torch.Tensor): The second act_order input permutation.
    - w1_zeros (Optional[torch.Tensor]): Optional zero points to be used for w1.
    - w2_zeros (Optional[torch.Tensor]): Optional zero points to be used for w2.
    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
    - num_bits (bool): The number of bits in expert weights quantization.

    Returns:
    - torch.Tensor: The output tensor after applying the MoE layer.
    """
    # Check constraints.
    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
    assert hidden_states.shape[1] == w1.shape[1] * 16, "Hidden size mismatch w1"
    assert hidden_states.shape[1] == w2.shape[2] // (
        num_bits // 2
    ), "Hidden size mismatch w2"
    assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
    assert hidden_states.dtype == torch.float16
    assert num_bits in [4, 8]

    # DeekSeekv2 uses grouped_top_k
    if use_grouped_topk:
        assert topk_group is not None
        assert num_expert_group is not None
        topk_weights, topk_ids = moe_kernels.grouped_topk(
            hidden_states=hidden_states,
            gating_output=gating_output,
            topk=topk,
            renormalize=renormalize,
            num_expert_group=num_expert_group,
            topk_group=topk_group,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias,
        )
    elif custom_routing_function is None:
        topk_weights, topk_ids = moe_kernels.fused_topk(
            hidden_states=hidden_states,
            gating_output=gating_output,
            topk=topk,
            renormalize=renormalize,
        )
    else:
        topk_weights, topk_ids = custom_routing_function(
            hidden_states=hidden_states,
            gating_output=gating_output,
            topk=topk,
            renormalize=renormalize,
        )
    return moe_kernels.fused_marlin_moe(
        hidden_states=hidden_states,
        w1=w1,
        w2=w2,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        gating_output=gating_output,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        g_idx1=g_idx1,
        g_idx2=g_idx2,
        sort_indices1=sort_indices1,
        sort_indices2=sort_indices2,
        w1_zeros=w1_zeros,
        w2_zeros=w2_zeros,
        num_bits=num_bits,
        is_k_full=is_k_full,
    )


================================================
FILE: server/text_generation_server/layers/moe/unquantized.py
================================================
from typing import Callable, List, Optional

import torch
import torch.nn as nn

from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel
from text_generation_server.utils.weights import UnquantizedWeight, Weights

if SYSTEM == "ipex":
    from intel_extension_for_pytorch.llm.modules import GatedMLPMOE
elif SYSTEM == "cuda":
    moe_kernels = load_kernel(module="moe", repo_id="kernels-community/moe")
else:
    import moe_kernels


class UnquantizedSparseMoELayer(nn.Module):
    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        scoring_func: Optional[str] = "softmax",
        e_score_correction_bias: Optional[float] = None,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        super().__init__()

        assert (n_expert_group is None) == (
            topk_group is None
        ), "n_expert_group and topk_group must both be None or have some value"

        self.n_expert_group = n_expert_group
        self.topk = topk
        self.topk_group = topk_group
        self.renormalize = renormalize
        self.weight_block_size = weights.weights_loader.weight_block_size
        self.scoring_func = scoring_func
        self.e_score_correction_bias = e_score_correction_bias

        self.gate_up_proj = _load_expert_multi_weights_col(
            prefix=prefix,
            n_experts=n_experts,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
            weights=weights,
        )

        self.down_proj = _load_expert_weights_row(
            prefix=prefix,
            n_experts=n_experts,
            name=down_proj_name,
            weights=weights,
        )
        if SYSTEM == "ipex":
            self.ipex_fused_moe = GatedMLPMOE(
                W13=self.gate_up_proj, W2=self.down_proj, use_prepack=True
            )

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        if SYSTEM == "rocm":
            return moe_kernels.fused_moe(
                x,
                self.gate_up_proj,
                self.down_proj,
                gating_output,
                self.topk,
                renormalize=self.renormalize,
                inplace=True,
            )
        elif SYSTEM == "ipex":
            return self.ipex_fused_moe(
                hidden_states=x,
                router_logits=gating_output,
                top_k=self.topk,
                renormalize=self.renormalize,
                use_grouped_topk=self.n_expert_group is not None,
                num_expert_group=self.n_expert_group,
                topk_group=self.topk_group,
                scoring_func=self.scoring_func,
                e_score_correction_bias=self.e_score_correction_bias,
            )
        return fused_moe(
            x,
            w1=self.gate_up_proj,
            w2=self.down_proj,
            gating_output=gating_output,
            topk=self.topk,
            renormalize=self.renormalize,
            inplace=True,
            use_grouped_topk=self.n_expert_group is not None,
            num_expert_group=self.n_expert_group,
            topk_group=self.topk_group,
            scoring_func=self.scoring_func,
            e_score_correction_bias=self.e_score_correction_bias,
        )


def _load_expert_multi_weights_col(
    *,
    prefix: str,
    n_experts: int,
    gate_proj_name: str,
    up_proj_name: str,
    weights: Weights,
) -> torch.Tensor:
    all_weight = None
    for i in range(n_experts):
        weight = weights.get_multi_weights_col(
            [f"{prefix}.{i}.{gate_proj_name}", f"{prefix}.{i}.{up_proj_name}"], 0
        )

        assert isinstance(weight, UnquantizedWeight)

        if all_weight is None:
            all_weight = torch.empty(
                (n_experts,) + weight.weight.shape,
                dtype=weight.weight.dtype,
                device=weight.weight.device,
            )

        all_weight[i] = weight.weight

    assert all_weight is not None

    return all_weight


def _load_expert_weights_row(
    *,
    prefix: str,
    n_experts: int,
    name: str,
    weights: Weights,
) -> torch.Tensor:
    all_weight = None
    for i in range(n_experts):
        weight = weights.get_weights_row(
            f"{prefix}.{i}.{name}",
        )

        assert isinstance(weight, UnquantizedWeight)

        if all_weight is None:
            all_weight = torch.empty(
                (n_experts,) + weight.weight.shape,
                dtype=weight.weight.dtype,
                device=weight.weight.device,
            )

        all_weight[i] = weight.weight

    assert all_weight is not None

    return all_weight


def fused_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
    inplace: bool = False,
    use_grouped_topk: bool = False,
    num_expert_group: Optional[int] = None,
    topk_group: Optional[int] = None,
    custom_routing_function: Optional[Callable] = None,
    scoring_func: str = "softmax",
    e_score_correction_bias: Optional[torch.Tensor] = None,
    use_fp8_w8a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    a1_scale: Optional[torch.Tensor] = None,
    a2_scale: Optional[torch.Tensor] = None,
    block_shape: Optional[List[int]] = None,
) -> torch.Tensor:
    """
    This function computes a Mixture of Experts (MoE) layer using two sets of
    weights, w1 and w2, and top-k gating mechanism.

    Parameters:
    - hidden_states (torch.Tensor): The input tensor to the MoE layer.
    - w1 (torch.Tensor): The first set of expert weights.
    - w2 (torch.Tensor): The second set of expert weights.
    - gating_output (torch.Tensor): The output of the gating operation
        (before softmax).
    - topk (int): The number of top-k experts to select.
    - renormalize (bool): If True, renormalize the top-k weights to sum to 1.
    - inplace (bool): If True, perform the operation in-place.
        Defaults to False.
    - num_expert_group: Optional[int]: additional parameter for grouped_topk
    - topk_group: Optional[int]: additional parameter for grouped_topk
    - use_grouped_topk: If True, use grouped_topk instead of fused_topk
        note: Deepseekv2 model uses grouped_topk
    - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
        products for w1 and w2. Defaults to False.
    - use_int8_w8a16 (bool): If True, use fp8 arithmetic to compute the inner
        products for w1 and w2. Defaults to False.
    - use_int4_w4a16 (bool): If True, use matmul of int4 weight and bf16/fp16
        activation to compute the inner products for w1 and w2.
        Defaults to False.
    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
        w1.
    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
        w2.
    - a1_scale (Optional[torch.Tensor]): Optional scale to be used for
        a1.
    - a2_scale (Optional[torch.Tensor]): Optional scale to be used for
        a2.
    - block_shape: (Optional[List[int]]): Optional block size for block-wise
        quantization.
    Returns:
    - torch.Tensor: The output tensor after applying the MoE layer.
    """
    # Check constraints.
    assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"

    if use_grouped_topk:
        assert num_expert_group is not None and topk_group is not None
        from loguru import logger
        import inspect

        logger.info(f"{inspect.signature(moe_kernels.grouped_topk)}")
        topk_weights, topk_ids = moe_kernels.grouped_topk(
            hidden_states,
            gating_output,
            topk,
            renormalize,
            num_expert_group,
            topk_group,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias,
        )
    elif custom_routing_function is None:
        topk_weights, topk_ids = moe_kernels.fused_topk(
            hidden_states, gating_output, topk, renormalize
        )
    else:
        topk_weights, topk_ids = custom_routing_function(
            hidden_states, gating_output, topk, renormalize
        )

    return moe_kernels.fused_experts(
        hidden_states,
        w1,
        w2,
        topk_weights,
        topk_ids,
        inplace=inplace,
        use_fp8_w8a8=use_fp8_w8a8,
        use_int8_w8a16=use_int8_w8a16,
        use_int4_w4a16=use_int4_w4a16,
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        block_shape=block_shape,
    )


================================================
FILE: server/text_generation_server/layers/rotary.py
================================================
import os
import math
import torch
from torch import nn
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM == "cuda":
    from text_generation_server.utils.kernels import load_kernel

    rotary = load_kernel(module="rotary", repo_id="kernels-community/rotary")
elif SYSTEM == "rocm":
    import vllm._custom_ops as ops
elif SYSTEM == "ipex":
    import intel_extension_for_pytorch as ipex


def _create_inv_freq(dim, base, device):
    inv_freq = 1.0 / (
        base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
    )
    return inv_freq


def _get_rope_config(config):
    if os.getenv("ROPE_SCALING", None) is not None:
        rope_scaling = {
            "type": os.environ["ROPE_SCALING"],
            "factor": float(os.environ["ROPE_FACTOR"]),
        }
        return rope_scaling
    return getattr(config, "rope_scaling", None)


class PositionRotaryEmbedding(nn.Module):
    def __init__(self, inv_freq, scaling_factor):
        super().__init__()
        self.inv_freq = inv_freq
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.scaling_factor = scaling_factor
        self.dynamic_args = None

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        # Such controlflows may add some overhead.
        if SYSTEM == "cuda":
            rotary_dim = cos.shape[-1]
            q1 = query[..., :rotary_dim]
            q2 = query[..., rotary_dim : 2 * rotary_dim]

            rotary.apply_rotary(q1, q2, cos, sin, q1, q2, False)

            k1 = key[..., :rotary_dim]
            k2 = key[..., rotary_dim : 2 * rotary_dim]

            rotary.apply_rotary(k1, k2, cos, sin, k1, k2, False)
        elif SYSTEM == "rocm":
            # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
            # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773

            head_size = query.shape[-1]

            # Inplace operation, updating query and key.
            ops.rotary_embedding(query, key, head_size, cos, sin, True)
        elif SYSTEM == "ipex":
            ipex.llm.functional.rotary_embedding(
                query, key, sin, cos, query.size(-1), True
            )
        else:
            raise ValueError(
                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
            )

    @classmethod
    def static(cls, config, dim, base, device):
        inv_freq = _create_inv_freq(dim, base, device)
        scaling_factor = None
        rope_scaling = _get_rope_config(config)
        if rope_scaling is not None:
            # `rope_type` is now standard in transformers, but some existing models
            # have `type` instead.
            rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))

            if rope_type == "linear":
                pass
            elif rope_type == "default":
                pass
            elif rope_type == "mrope":
                mrope_section = rope_scaling["mrope_section"]
                if mrope_section is not None:
                    return RotaryPositionEmbeddingMultimodalSections(
                        inv_freq, scaling_factor, mrope_section
                    )
            elif rope_type == "dynamic":
                scaling_factor = rope_scaling["factor"]
                return DynamicPositionRotaryEmbedding(
                    dim=dim,
                    max_position_embeddings=config.max_position_embeddings,
                    base=base,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                )
            elif rope_type == "llama3":
                inv_freq = apply_llama3_scaling(
                    inv_freq,
                    scaling_factor=rope_scaling["factor"],
                    low_freq_factor=rope_scaling["low_freq_factor"],
                    high_freq_factor=rope_scaling["high_freq_factor"],
                    original_max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                )

                return cls(inv_freq, scaling_factor)

            elif rope_type == "yarn":
                scaling_factor = rope_scaling["factor"]
                mscale = rope_scaling.get("mscale", 1.0)
                mscale_all_dim = rope_scaling.get("mscale_all_dim", 0.0)
                return YarnPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                    base=base,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                    extrapolation_factor=1,
                    attn_factor=1,
                    beta_fast=32,
                    beta_slow=1,
                    mscale=mscale,
                    mscale_all_dim=mscale_all_dim,
                )
            elif rope_type in ["su", "longrope"]:
                short_factor = torch.tensor(
                    rope_scaling["short_factor"], dtype=torch.float32, device=device
                )
                short_inv_freq = 1.0 / (
                    short_factor
                    * base
                    ** (
                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
                        / dim
                    )
                )
                long_factor = torch.tensor(
                    rope_scaling["long_factor"], dtype=torch.float32, device=device
                )
                long_inv_freq = 1.0 / (
                    long_factor
                    * base
                    ** (
                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
                        / dim
                    )
                )

                original_max_position_embeddings = (
                    config.original_max_position_embeddings
                )
                max_position_embeddings = config.max_position_embeddings
                if max_position_embeddings <= original_max_position_embeddings:
                    scaling_factor = 1.0
                else:
                    scale = max_position_embeddings / original_max_position_embeddings
                    scaling_factor = math.sqrt(
                        1 + math.log(scale) / math.log(original_max_position_embeddings)
                    )

                # if short_mscale and long_mscale are provided we need to scale the freqs
                # using the Phi3LongRoPEScaledRotaryEmbedding
                if ("short_mscale" in rope_scaling) and ("long_mscale" in rope_scaling):
                    short_mscale = rope_scaling["short_mscale"]
                    long_mscale = rope_scaling["long_mscale"]
                    return Phi3LongRoPEScaledRotaryEmbedding(
                        short_inv_freq=short_inv_freq,
                        long_inv_freq=long_inv_freq,
                        max_position_embeddings=config.max_position_embeddings,
                        short_mscale=short_mscale,
                        long_mscale=long_mscale,
                        original_max_position_embeddings=original_max_position_embeddings,
                    )

                return SuRotaryEmbedding(
                    short_inv_freq=short_inv_freq,
                    long_inv_freq=long_inv_freq,
                    scaling_factor=scaling_factor,
                    original_max_position_embeddings=original_max_position_embeddings,
                )
            else:
                raise NotImplementedError(
                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                )
        return cls(inv_freq, scaling_factor)

    @classmethod
    def load(cls, config, prefix, weights):
        # XXX: Always load this in float32 !
        dtype = weights.dtype
        weights.dtype = torch.float32
        inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
        weights.dtype = dtype

        scaling_factor = None
        rope_scaling = _get_rope_config(config)
        if rope_scaling is not None:
            scaling_factor = rope_scaling["factor"]
            if rope_scaling["type"] == "linear":
                pass
            elif rope_scaling["type"] == "dynamic":
                return DynamicPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=config.max_position_embeddings,
                    base=10000.0,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                )
            elif rope_scaling["type"] == "yarn":
                mscale = rope_scaling.get("mscale", 1.0)
                mscale_all_dim = rope_scaling.get("mscale_all_dim", 0.0)
                return YarnPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                    base=10000.0,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                    extrapolation_factor=1,
                    attn_factor=1,
                    beta_fast=32,
                    beta_slow=1,
                    mscale=mscale,
                    mscale_all_dim=mscale_all_dim,
                )
            else:
                raise NotImplementedError(
                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                )
        return cls(inv_freq, scaling_factor)

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            if self.scaling_factor is not None:
                t /= self.scaling_factor
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)

    def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype):
        """
        Return cos and sin for the asked position ids
        """
        if SYSTEM == "rocm":
            # For RoCm, we always use float cos/sin to avoid a cast.
            # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26
            # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal.
            dtype = torch.float32

        self._update_cos_sin_cache(dtype, position_ids.device, max_s)

        cos = torch.index_select(self._cos_cached, 0, position_ids)
        sin = torch.index_select(self._sin_cached, 0, position_ids)

        # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
        return cos.unsqueeze(1), sin.unsqueeze(1)


class SuRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(
        self,
        short_inv_freq,
        long_inv_freq,
        scaling_factor,
        original_max_position_embeddings,
    ):
        super(PositionRotaryEmbedding, self).__init__()
        self.short_inv_freq = short_inv_freq
        self.long_inv_freq = long_inv_freq
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.dynamic_args = None

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached is None
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen

            t = torch.arange(seqlen, device=device, dtype=self.short_inv_freq.dtype)
            short_freqs = torch.outer(
                t[: self.original_max_position_embeddings],
                self.short_inv_freq.to(device=t.device),
            )
            long_freqs = torch.outer(
                t[self.original_max_position_embeddings :],
                self.long_inv_freq.to(device=t.device),
            )

            freqs = torch.cat([short_freqs, long_freqs])

            self._cos_cached = (torch.cos(freqs) * self.scaling_factor).to(dtype)
            self._sin_cached = (torch.sin(freqs) * self.scaling_factor).to(dtype)


class Phi3LongRoPEScaledRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(
        self,
        short_inv_freq: torch.Tensor,
        long_inv_freq: torch.Tensor,
        max_position_embeddings: int,
        short_mscale: float,
        long_mscale: float,
        original_max_position_embeddings: int,
    ):
        super(PositionRotaryEmbedding, self).__init__()
        self.short_inv_freq = short_inv_freq
        self.long_inv_freq = long_inv_freq
        self.max_position_embeddings = max_position_embeddings
        self.short_mscale = short_mscale
        self.long_mscale = long_mscale
        self.original_max_position_embeddings = original_max_position_embeddings

        # cache
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.dynamic_args = None

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached is None
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.short_inv_freq.dtype)

            short_freqs = torch.outer(
                t[: self.original_max_position_embeddings],
                self.short_inv_freq.to(device=t.device),
            )

            long_freqs = torch.outer(
                t[self.original_max_position_embeddings :],
                self.long_inv_freq.to(device=t.device),
            )

            short_freqs = short_freqs * self.short_mscale
            long_freqs = long_freqs * self.long_mscale

            freqs = torch.empty((seqlen, short_freqs.shape[1]), device=device)
            freqs[: self.original_max_position_embeddings] = short_freqs
            freqs[self.original_max_position_embeddings :] = long_freqs

            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)


class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
        inv_freq = _create_inv_freq(dim, base, device)
        super().__init__(inv_freq, scaling_factor)
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            if seqlen > self.max_position_embeddings:
                newbase = self.base * (
                    (self.scaling_factor * seqlen / self.max_position_embeddings)
                    - (self.scaling_factor - 1)
                ) ** (self.dim / (self.dim - 2))
                self.inv_freq = _create_inv_freq(
                    self.dim, newbase, self.inv_freq.device
                )
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)


def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
    high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def linear_ramp_mask(min, max, dim):
    if min == max:
        max += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func


def get_mscale(scale: float = 1.0, mscale: float = 1.0):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
    def __init__(
        self,
        dim,
        max_position_embeddings,
        base,
        device,
        scaling_factor,
        *,
        extrapolation_factor,
        attn_factor,
        beta_fast,
        beta_slow,
        mscale: float,
        mscale_all_dim: float,
    ):
        inv_freq = _create_inv_freq(dim, base, device)
        super().__init__(inv_freq, scaling_factor)
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.extrapolation_factor = extrapolation_factor
        self.attn_factor = attn_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale_all_dim = mscale_all_dim
        self.scaling_factor = scaling_factor
        self.mscale = float(
            get_mscale(self.scaling_factor, mscale)
            / get_mscale(self.scaling_factor, mscale_all_dim)
            * self.attn_factor
        )  # Get n-d magnitude scaling corrected for interpolation

    def _update_cos_sin_cache(self, dtype, device, seqlen):
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            if seqlen > self.max_position_embeddings or True:
                inv_freq_extrapolation = _create_inv_freq(
                    self.dim, self.base, self.inv_freq.device
                )
                freqs = 1.0 / inv_freq_extrapolation
                inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
                low, high = find_correction_range(
                    self.beta_fast,
                    self.beta_slow,
                    self.dim,
                    self.base,
                    self.max_position_embeddings,
                )

                inv_freq_mask = (
                    1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device)
                ) * self.extrapolation_factor  # Get n-d rotational scaling corrected for extrapolation
                inv_freq = (
                    inv_freq_interpolation * (1 - inv_freq_mask)
                    + inv_freq_extrapolation * inv_freq_mask
                )

                self.inv_freq = inv_freq

            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)

            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype)
            self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype)


def apply_llama3_scaling(
    freqs: torch.Tensor,
    *,
    scaling_factor: int,
    low_freq_factor: int,
    high_freq_factor: int,
    original_max_position_embeddings: int,
):
    low_freq_wavelen = original_max_position_embeddings / low_freq_factor
    high_freq_wavelen = original_max_position_embeddings / high_freq_factor
    new_freqs = []

    for freq in freqs:
        wavelen = 2 * math.pi / freq

        if wavelen < high_freq_wavelen:
            new_freqs.append(freq)
        elif wavelen > low_freq_wavelen:
            new_freqs.append(freq / scaling_factor)
        else:
            assert low_freq_wavelen != high_freq_wavelen
            smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
                high_freq_factor - low_freq_factor
            )
            new_freqs.append((1 - smooth) * freq / scaling_factor + smooth * freq)

    return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)


class RotaryPositionEmbeddingMultimodalSections(PositionRotaryEmbedding):
    def __init__(self, inv_freq: torch.Tensor, scaling_factor: float, sections: list):
        super().__init__(inv_freq, scaling_factor)
        self.sections = sections
        self._cos_cached = None
        self._sin_cached = None
        self.section_indices = (
            torch.arange(len(self.sections))
            .repeat_interleave(torch.tensor(self.sections))
            .view(1, 1, -1)
            .to(inv_freq.device)
        )

    def _update_cos_sin_cache(
        self, dtype: torch.dtype, device: torch.device, seqlen: int
    ):
        # always cache the cos/sin for the full sequence length to avoid
        # recomputing if the sequence length is smaller than the cached one
        if (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
            self._sections = self.section_indices.expand(seqlen, -1, -1)

    def get_cos_sin(
        self,
        position_ids: torch.Tensor,
        max_s: int,
        dtype: torch.dtype,
    ):
        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
        slen = position_ids.shape[0]

        cos = self._cos_cached[position_ids].gather(1, self._sections[:slen])
        sin = self._sin_cached[position_ids].gather(1, self._sections[:slen])
        return cos, sin


================================================
FILE: server/text_generation_server/layers/speculative.py
================================================
import torch
import json
from typing import Tuple, Optional
from text_generation_server.layers.tensor_parallel import TensorParallelHead
from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2
from text_generation_server.layers.mlp import MLPSpeculatorHead


class SpeculativeHead(torch.nn.Module):
    def __init__(self, lm_head, speculator):
        super().__init__()
        self.head = lm_head
        self.speculator = speculator

    @staticmethod
    def load(config, prefix: str, weights):
        speculator = config.speculator
        if speculator:
            speculator_path = config.speculator["path"]
            speculator_config = str(speculator_path / "config.json")

            with open(speculator_config, "r") as f:
                speculator_config = json.load(f)

            config.speculator_config = speculator_config
            try:
                architecture = speculator_config["architectures"][0]

                if architecture == "MLPSpeculatorPreTrainedModel":
                    speculator = MLPSpeculatorHead.load(config, prefix, weights)
                else:
                    speculator = None
            except KeyError:
                try:
                    speculator = MedusaHeadV1.load(config, prefix, weights)
                except Exception:
                    speculator = MedusaHeadV2(config, prefix, weights)
            lm_head = None
        else:
            lm_head = TensorParallelHead.load(config, prefix, weights)
            speculator = None
        return SpeculativeHead(lm_head, speculator)

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        if self.speculator is not None:
            return self.speculator(input)

        assert self.head is not None
        logits = self.head(input)
        return logits, None


================================================
FILE: server/text_generation_server/layers/tensor_parallel.py
================================================
import torch
from torch.nn import functional as F
from typing import Iterable, List
from text_generation_server.layers.linear import get_linear, FastLinear
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM == "ipex":
    import intel_extension_for_pytorch as ipex


class LayerConcat(torch.nn.Module):
    """
    Apply multiple layers to the input and concatenate their
    outputs.
    """

    def __init__(self, layers: Iterable[torch.nn.Module], dim: int = -1):
        """
        `dim` is the dimension along which layer outputs are concatenated.
        """
        super().__init__()
        self.layers = layers
        self.dim = dim

    def forward(self, x: torch.Tensor):
        outputs = [layer(x) for layer in self.layers]
        return torch.cat(outputs, self.dim)


class SuperLayer(torch.nn.Module):
    def __init__(self, linear):
        super().__init__()
        self.linear = linear

    def forward(self, x):
        return self.linear.forward(x)


class TensorParallelHead(SuperLayer):
    def __init__(self, linear, process_group, should_gather: bool):
        super().__init__(linear)
        self.process_group = process_group
        self.should_gather = should_gather

    @staticmethod
    def load(config, prefix: str, weights):
        if config.quantize == "exl2":
            try:
                # If the piece and LM head embeddings are shared, we have
                # non-quantized weights...
                weight = weights.get_tensor(f"{prefix}.weight")
            except Exception:
                # ...otherwise they are quantized.
                weight = weights.get_weights_col(prefix)
            should_gather = weights.process_group.size() > 1
        elif weights.process_group.size() > 1:
            try:
                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
                should_gather = True
            except AssertionError:
                # If the vocab size is not divisible by number of shards
                # just load the entire thing.
                weight = weights.get_tensor(f"{prefix}.weight")
                should_gather = False
        else:
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False

        return TensorParallelHead(
            get_linear(weight, bias=None),
            process_group=weights.process_group,
            should_gather=should_gather,
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if not self.should_gather:
            return super().forward(input)

        world_size = self.process_group.size()
        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
            out_dim = self.linear.weight.shape[0]

            if input.shape[0] == 1:
                world_out = input.new_empty(1, out_dim * world_size)
                local_out = input.new_empty(1, out_dim)
                gather_input = local_out
            else:
                world_out = input.new_empty(out_dim * world_size, input.shape[0])
                gather_input = input.new_empty(out_dim, input.shape[0])
                local_out = gather_input.T

            torch.mm(input, self.linear.weight.T, out=local_out)
            if SYSTEM == "ipex" and gather_input.device.type == "cpu":
                ipex.distributed.all_gather_into_tensor(
                    world_out, gather_input, group=self.process_group
                )
            else:
                torch.distributed.all_gather_into_tensor(
                    world_out, gather_input, group=self.process_group
                )

            if input.shape[0] == 1:
                return world_out
            return world_out.T

        output = super().forward(input)
        world_output = [
            torch.empty_like(output) for _ in range(self.process_group.size())
        ]
        if SYSTEM == "ipex" and output.device.type == "cpu":
            ipex.distributed.all_gather(world_output, output, group=self.process_group)
        else:
            torch.distributed.all_gather(world_output, output, group=self.process_group)
        world_output = torch.cat(world_output, dim=-1)
        return world_output


class TensorParallelColumnLinear(SuperLayer):
    @classmethod
    def load_gate_up(cls, config, prefix: str, weights, bias: bool):
        """Specific method when the QKV was joined after the fact"""
        weight = weights.get_weights_col_packed_gate_up(prefix)
        if bias:
            raise NotImplementedError("packed_gate_up only implemented without bias")
        else:
            bias = None
        linear = get_linear(weight, bias)
        return cls(linear)

    @classmethod
    def load_qkv(
        cls,
        config,
        prefix: str,
        weights,
        bias: bool,
        num_heads: int,
        num_key_value_heads: int,
    ):
        """Specific method when the QKV was joined after the fact"""
        weight = weights.get_weights_col_packed_qkv(
            prefix,
            num_heads=num_heads,
            num_key_value_heads=num_key_value_heads,
        )
        if bias:
            raise NotImplementedError("packed_qkv only implemented for baichuan")
        else:
            bias = None
        linear = get_linear(weight, bias)
        return cls(linear)

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_weights_col(prefix)
        if bias:
            bias = weights.get_sharded(f"{prefix}.bias", dim=0)
        else:
            bias = None
        linear = get_linear(weight, bias)
        return cls(linear)

    @classmethod
    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
        if config.quantize == "exl2":
            linears = []
            for prefix in prefixes:
                weight = weights.get_weights_col(prefix)
                b = weights.get_tensor(f"{prefix}.bias") if bias else None
                linears.append(get_linear(weight, b))
            linear = LayerConcat(linears)
        else:
            weight = weights.get_multi_weights_col(prefixes, dim=dim)
            if bias:
                b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
                bias = torch.cat(b, dim=dim)
            else:
                bias = None
            linear = get_linear(weight, bias)
        return cls(linear)


class TensorParallelRowLinear(SuperLayer):
    def __init__(self, linear, process_group):
        super().__init__(linear)
        self.process_group = process_group

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        weight = weights.get_weights_row(prefix)

        if bias and weights.process_group.rank() == 0:
            # Rank is only on the first rank process
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(
            get_linear(weight, bias),
            process_group=weights.process_group,
        )

    def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
        out = super().forward(input)
        if self.process_group.size() > 1 and reduce:
            if SYSTEM == "ipex" and out.device.type == "cpu":
                ipex.distributed.all_reduce(out, group=self.process_group)
            else:
                torch.distributed.all_reduce(out, group=self.process_group)
        return out


class TensorParallelEmbedding(torch.nn.Module):
    def __init__(self, prefix: str, weights, reduce=True):
        super().__init__()
        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]

        process_group = weights.process_group

        world_size = process_group.size()
        rank = process_group.rank()

        block_size = (num_embeddings + world_size - 1) // world_size
        self.min_id = rank * block_size
        self.max_id = min(num_embeddings, (rank + 1) * block_size)
        self.null_idx = weight.shape[
            0
        ]  # Usually block_size, might be less in non even vocab_size.
        self.process_group = weights.process_group
        self.reduce = reduce

        """Additional 0 entry used for masking"""
        self.weight = torch.nn.Parameter(F.pad(weight, (0, 0, 0, 1)))

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        input = torch.where(
            (self.min_id > input) | (input >= self.max_id),
            self.null_idx,
            input - self.min_id,
        )
        out = torch.nn.functional.embedding(input, self.weight)
        if self.reduce and self.process_group.size() > 1:
            if SYSTEM == "ipex" and out.device.type == "cpu":
                ipex.distributed.all_reduce(out, group=self.process_group)
            else:
                torch.distributed.all_reduce(out, group=self.process_group)
        return out


================================================
FILE: server/text_generation_server/models/__init__.py
================================================
# ruff: noqa: F821
# the above line disables the `undefined-name` rule for the model type variables

from compressed_tensors.compressors.model_compressors.model_compressor import (
    QuantizationConfig,
)
from compressed_tensors.quantization import QuantizationType
from pydantic import ValidationError
import enum
import os
from typing import Optional, List, Dict
from pathlib import Path
from loguru import logger

import torch
import transformers
from transformers.configuration_utils import PretrainedConfig
from transformers.models.auto import modeling_auto
from transformers.dynamic_module_utils import get_class_from_dynamic_module
from huggingface_hub import hf_hub_download, HfApi

from text_generation_server.utils.speculate import get_speculate, set_speculate
from text_generation_server.models.model import Model
from text_generation_server.models.causal_lm import CausalLM, CausalLMBatchKeysLast

from text_generation_server.models.custom_modeling.opt_modeling import OPTForCausalLM
from text_generation_server.models.custom_modeling.mpt_modeling import (
    MPTForCausalLM,
)
from text_generation_server.models.bloom import BloomCausalLMBatch
from text_generation_server.models.custom_modeling.bloom_modeling import (
    BloomForCausalLM,
)
from text_generation_server.models.globals import ATTENTION
from text_generation_server.models.seq2seq_lm import Seq2SeqLM
from text_generation_server.models.galactica import GalacticaCausalLMBatch
from text_generation_server.models.custom_modeling.neox_modeling import (
    GPTNeoxForCausalLM,
)
from text_generation_server.models.custom_modeling.phi_modeling import (
    PhiConfig,
    PhiForCausalLM,
)
from text_generation_server.models.custom_modeling.flash_phi_moe_modeling import (
    PhiMoEConfig,
)
from text_generation_server.models.custom_modeling.t5_modeling import (
    T5ForConditionalGeneration,
)


from text_generation_server.utils.adapter import (
    AdapterParameters,
    build_layer_weight_lookup,
    load_and_merge_adapters,
    AdapterInfo,
)
from text_generation_server.adapters.lora import LoraWeights


from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.log import log_master

# The flag below controls whether to allow TF32 on matmul. This flag defaults to False
# in PyTorch 1.12 and later.
torch.backends.cuda.matmul.allow_tf32 = True

# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
torch.backends.cudnn.allow_tf32 = True

# Disable gradients
torch.set_grad_enabled(False)

__all__ = [
    "Model",
    "CausalLM",
    "Seq2SeqLM",
    "get_model_with_lora_adapters",
]

FLASH_ATT_ERROR_MESSAGE = "{} requires Flash Attention enabled models."

FLASH_ATTENTION = True

try:
    from text_generation_server.models.flash_causal_lm import FlashCausalLM
    from text_generation_server.models.vlm_causal_lm import VlmCausalLM
    from text_generation_server.models.mllama_causal_lm import MllamaCausalLM
    from text_generation_server.models.custom_modeling.flash_deepseek_v2_modeling import (
        FlashDeepseekV2ForCausalLM,
        DeepseekV2Config,
    )
    from text_generation_server.models.custom_modeling.flash_deepseek_v3_modeling import (
        FlashDeepseekV3ForCausalLM,
        DeepseekV3Config,
    )
    from text_generation_server.models.custom_modeling.flash_llama_modeling import (
        FlashLlamaForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_cohere_modeling import (
        FlashCohereForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gemma_modeling import (
        FlashGemmaForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gemma2_modeling import (
        FlashGemma2ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gemma3_modeling import (
        FlashGemma3ForCausalLM,
        Gemma3ForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.gemma3.processing_gemma3 import (
        Gemma3Processor,
    )
    from text_generation_server.models.custom_modeling.gemma3.configuration_gemma3 import (
        Gemma3Config,
        Gemma3TextConfig,
    )
    from text_generation_server.models.custom_modeling.flash_dbrx_modeling import (
        FlashDbrxForCausalLM,
        DbrxConfig,
    )
    from text_generation_server.models.custom_modeling.flash_rw_modeling import (
        RWConfig,
        FlashRWForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_neox_modeling import (
        FlashGPTNeoXForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_pali_gemma_modeling import (
        PaliGemmaForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.flash_phi_modeling import (
        FlashPhiForCausalLM,
    )
    from text_generation_server.models.idefics_causal_lm import IdeficsCausalLM
    from text_generation_server.models.mllama_causal_lm import MllamaCausalLMBatch
    from text_generation_server.models.custom_modeling.mllama import (
        MllamaForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.llava_next import (
        LlavaNextForConditionalGeneration,
    )

    from text_generation_server.models.custom_modeling.flash_santacoder_modeling import (
        FlashSantacoderForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_starcoder2_modeling import (
        FlashStarcoder2ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
        Qwen2ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
        FlashMistralForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_mixtral_modeling import (
        FlashMixtralForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gpt2_modeling import (
        FlashGPT2ForCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_gptj_modeling import (
        FlashGPTJForCausalLM,
    )
    from text_generation_server.models.custom_modeling.idefics2 import (
        Idefics2ForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.idefics3 import (
        Idefics3ForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.qwen2_vl import (
        Qwen2VLForConditionalGeneration,
    )
    from text_generation_server.models.custom_modeling.qwen2_5_vl import (
        Qwen2_5VLForConditionalGeneration,
        Qwen2_5_VLConfig,
        Qwen2_5_VLProcessor,
    )
    from text_generation_server.layers.attention import SUPPORTS_WINDOWING
except ImportError as e:
    log_master(logger.warning, f"Could not import Flash Attention enabled models: {e}")
    SUPPORTS_WINDOWING = False
    FLASH_ATTENTION = False

if FLASH_ATTENTION:
    __all__.append(FlashCausalLM)
    __all__.append(IdeficsCausalLM)

MAMBA_AVAILABLE = True
try:
    from text_generation_server.models.mamba import Mamba
except ImportError as e:
    log_master(logger.warning, f"Could not import Mamba: {e}")
    MAMBA_AVAILABLE = False

if MAMBA_AVAILABLE:
    __all__.append(Mamba)

FLASH_TRANSFORMERS_BACKEND = torch.cuda.is_available() or SYSTEM == "ipex"

try:
    from text_generation_server.models.transformers_flash_causal_lm import (
        TransformersFlashCausalLM,
    )
    from text_generation_server.models.transformers_flash_vlm import (
        TransformersFlashVlmCausalLM,
        TransformersGemma3VlmCausalLM,
        TransformersLlama4VlmCausalLM,
    )
except ImportError as e:
    log_master(logger.warning, f"Could not import Flash Transformers Backend: {e}")
    FLASH_TRANSFORMERS_BACKEND = False


class ModelType(enum.Enum):
    DEEPSEEK_V2 = {
        "type": "deepseek_v2",
        "name": "Deepseek V2",
        "url": "https://huggingface.co/deepseek-ai/DeepSeek-V2",
    }
    DEEPSEEK_V3 = {
        "type": "deepseek_v3",
        "name": "Deepseek V3",
        "url": "https://huggingface.co/deepseek-ai/DeepSeek-V3",
    }
    IDEFICS2 = {
        "type": "idefics2",
        "name": "Idefics 2",
        "url": "https://huggingface.co/HuggingFaceM4/idefics2-8b",
        "multimodal": True,
    }
    IDEFICS3 = {
        "type": "idefics3",
        "name": "Idefics 3",
        "url": "https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3",
        "multimodal": True,
    }
    LLAVA_NEXT = {
        "type": "llava_next",
        "name": "Llava Next (1.6)",
        "url": "https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf",
        "multimodal": True,
    }
    LLAMA = {
        "type": "llama",
        "name": "Llama",
        "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f",
    }
    LLAMA4 = {
        "type": "llama4",
        "name": "Llama4",
        "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f",
    }
    PHI3 = {
        "type": "phi3",
        "name": "Phi 3",
        "url": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct",
    }
    GRANITE = {
        "type": "granite",
        "name": "Granite",
        "url": "https://huggingface.co/ibm-granite/granite-3.0-8b-instruct",
    }
    GEMMA = {
        "type": "gemma",
        "name": "Gemma",
        "url": "https://huggingface.co/google/gemma-7b",
    }
    PALIGEMMA = {
        "type": "paligemma",
        "name": "PaliGemma",
        "url": "https://huggingface.co/google/paligemma-3b-pt-224",
    }
    GEMMA2 = {
        "type": "gemma2",
        "name": "Gemma2",
        "url": "https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315",
    }
    GEMMA3 = {
        "type": "gemma3",
        "name": "Gemma3",
        "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d",
    }
    GEMMA3_TEXT = {
        "type": "gemma3_text",
        "name": "Gemma3 Text",
        "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d",
    }
    COHERE = {
        "type": "cohere",
        "name": "Cohere",
        "url": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    }
    DBRX = {
        "type": "dbrx",
        "name": "Dbrx",
        "url": "https://huggingface.co/databricks/dbrx-instruct",
    }
    MAMBA = {
        "type": "mamba",
        "name": "Mamba",
        "url": "https://huggingface.co/state-spaces/mamba-2.8b-slimpj",
    }
    MISTRAL = {
        "type": "mistral",
        "name": "Mistral",
        "url": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
    }
    MIXTRAL = {
        "type": "mixtral",
        "name": "Mixtral",
        "url": "https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1",
    }
    GPT_BIGCODE = {
        "type": "gpt_bigcode",
        "name": "Gpt Bigcode",
        "url": "https://huggingface.co/bigcode/gpt_bigcode-santacoder",
    }
    PHI = {
        "type": "phi",
        "name": "Phi",
        "url": "https://huggingface.co/microsoft/phi-1_5",
    }
    PHI_MOE = {
        "type": "phimoe",
        "name": "PhiMoe",
        "url": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct",
    }
    BAICHUAN = {
        "type": "baichuan",
        "name": "Baichuan",
        "url": "https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat",
    }
    FALCON = {
        "type": "falcon",
        "name": "Falcon",
        "url": "https://huggingface.co/tiiuae/falcon-7b-instruct",
    }
    STARCODER2 = {
        "type": "starcoder2",
        "name": "StarCoder 2",
        "url": "https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1",
    }
    QWEN2 = {
        "type": "qwen2",
        "name": "Qwen 2",
        "url": "https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f",
    }
    QWEN2_VL = {
        "type": "qwen2_vl",
        "name": "Qwen 2 VL",
        "url": "https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d",
    }
    QWEN2_5_VL = {
        "type": "qwen2_5_vl",
        "name": "Qwen 2.5 VL",
        "url": "https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e",
    }
    OPT = {
        "type": "opt",
        "name": "Opt",
        "url": "https://huggingface.co/facebook/opt-6.7b",
    }
    T5 = {
        "type": "t5",
        "name": "T5",
        "url": "https://huggingface.co/google/flan-t5-xxl",
    }
    GALACTICA = {
        "type": "galactica",
        "name": "Galactica",
        "url": "https://huggingface.co/facebook/galactica-120b",
    }
    SANTACODER = {
        "type": "santacoder",
        "name": "SantaCoder",
        "url": "https://huggingface.co/bigcode/santacoder",
    }
    BLOOM = {
        "type": "bloom",
        "name": "Bloom",
        "url": "https://huggingface.co/bigscience/bloom-560m",
    }
    MPT = {
        "type": "mpt",
        "name": "Mpt",
        "url": "https://huggingface.co/mosaicml/mpt-7b-instruct",
    }
    GPT2 = {
        "type": "gpt2",
        "name": "Gpt2",
        "url": "https://huggingface.co/openai-community/gpt2",
    }
    GPT_NEOX = {
        "type": "gpt_neox",
        "name": "Gpt Neox",
        "url": "https://huggingface.co/EleutherAI/gpt-neox-20b",
    }
    GPTJ = {
        "type": "gptj",
        "name": "Gptj",
        "url": "https://huggingface.co/EleutherAI/gpt-j-6b",
    }
    IDEFICS = {
        "type": "idefics",
        "name": "Idefics",
        "url": "https://huggingface.co/HuggingFaceM4/idefics-9b",
        "multimodal": True,
    }
    MLLAMA = {
        "type": "mllama",
        "name": "Mllama",
        "url": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct",
        "multimodal": True,
    }


__GLOBALS = locals()
for data in ModelType:
    __GLOBALS[data.name] = data.value["type"]


def get_model(
    model_id: str,
    lora_adapter_ids: Optional[List[str]],
    revision: Optional[str],
    sharded: bool,
    quantize: Optional[str],
    speculate: Optional[int],
    dtype: Optional[str],
    kv_cache_dtype: Optional[str],
    trust_remote_code: bool,
    max_input_tokens: int,
) -> Model:
    global FLASH_ATTENTION

    config_dict, _ = PretrainedConfig.get_config_dict(
        model_id, revision=revision, trust_remote_code=trust_remote_code
    )
    model_type = config_dict.get("model_type", None)

    quantization_config = config_dict.get("quantization_config", None)
    if quantization_config is None:
        quantization_config = config_dict.get("compression_config", None)
    if quantization_config is not None and quantize is None:
        method = quantization_config.get("quant_method", None)
        if method in {"gptq", "awq", "exl2"}:
            log_master(logger.info, f"Auto selecting quantization method {method}")
            quantize = method
        elif method == "fbgemm_fp8" or method == "fp8":
            log_master(logger.info, "Auto selecting quantization method fp8")
            quantize = "fp8"
        if method == "compressed-tensors":
            log_master(
                logger.info, "Auto selecting quantization method compressed-tensors"
            )
            quantize = "compressed-tensors"

        else:
            log_master(logger.warning, f"Unknown quantization method {method}")

    if dtype is None:
        if quantize in ["awq", "exl2", "gptq", "marlin"]:
            if SYSTEM == "ipex" and not (
                hasattr(torch, "xpu") and torch.xpu.is_available()
            ):
                dtype = torch.bfloat16
            else:
                # These quantizers only work with float16 params.
                dtype = torch.float16
        else:
            # Keep it as default for now and let
            # every model resolve their own default dtype.
            dtype = None
    elif dtype == "float16":
        dtype = torch.float16
    elif dtype == "bfloat16":
        dtype = torch.bfloat16
    else:
        raise RuntimeError(f"Unknown dtype {dtype}")

    compressed_tensors_config = None
    if quantize == "compressed-tensors":
        try:
            compressed_tensors_config = QuantizationConfig.model_validate(
                quantization_config
            )
        except ValidationError as e:
            raise ValueError("Cannot parse compressed-tensors configuration") from e

    if kv_cache_dtype is None:
        kv_cache_scheme = (
            compressed_tensors_config.kv_cache_scheme
            if isinstance(compressed_tensors_config, QuantizationConfig)
            else None
        )
        if (
            kv_cache_scheme is not None
            and kv_cache_scheme.type == QuantizationType.FLOAT
            and kv_cache_scheme.num_bits == 8
            and SYSTEM == "cuda"
            and ATTENTION == "flashinfer"
        ):
            kv_cache_dtype = torch.float8_e4m3fn
        else:
            kv_cache_dtype = dtype
    elif kv_cache_dtype == "fp8_e4m3fn":
        kv_cache_dtype = torch.float8_e4m3fn
    elif kv_cache_dtype == "fp8_e5m2":
        kv_cache_dtype = torch.float8_e5m2
    else:
        raise RuntimeError(f"Unknown kv_cache_dtype: {kv_cache_dtype}")

    if speculate is not None:
        set_speculate(speculate)
    else:
        set_speculate(0)

    speculator = None
    if "medusa_num_heads" in config_dict:
        medusa_model_id = model_id
        medusa_revision = revision
        model_id = config_dict["base_model_name_or_path"]
        revision = "main"
        speculate_medusa = config_dict["medusa_num_heads"]
        if speculate is not None:
            if speculate > speculate_medusa:
                raise RuntimeError(
                    f"Speculate is set to `{speculate}` but this medusa models only has `{speculate_medusa}` heads, please make them match"
                )
            else:
                set_speculate(speculate)
        else:
            set_speculate(speculate_medusa)

        config_dict, _ = PretrainedConfig.get_config_dict(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        # Reload model type from parent.
        model_type = config_dict.get("model_type", None)
        is_local = Path(medusa_model_id).exists()
        if not is_local:
            medusa_config = hf_hub_download(
                medusa_model_id, revision=medusa_revision, filename="config.json"
            )
            hf_hub_download(
                medusa_model_id,
                revision=medusa_revision,
                filename="medusa_lm_head.safetensors",
            )
            speculator = {
                "path": Path(medusa_config).parent,
                "model_paths": ["medusa_lm_head.safetensors"],
            }
        else:
            speculator = {
                "path": Path(medusa_model_id),
                "model_paths": ["medusa_lm_head.safetensors"],
            }

        method = "medusa"
    elif model_type == "mlp_speculator":
        mlp_model_id = model_id
        mlp_revision = revision
        model_id = config_dict["base_model_name_or_path"]
        revision = "main"
        speculate_mlp = config_dict["n_predict"]
        if speculate is not None:
            if speculate > speculate_mlp:
                raise RuntimeError(
                    f"Speculate is set to `{speculate}` but this mlp_speculator models only has `{speculate_mlp}` heads, please make them match"
                )
            else:
                set_speculate(speculate)
        else:
            set_speculate(speculate_mlp)

        config_dict, _ = PretrainedConfig.get_config_dict(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        # Reload model type from parent.
        model_type = config_dict.get("model_type", None)
        is_local = Path(mlp_model_id).exists()
        extension = ".safetensors"
        if not is_local:
            mlp_speculator_config = hf_hub_download(
                mlp_model_id, revision=mlp_revision, filename="config.json"
            )
            api = HfApi()
            info = api.model_info(mlp_model_id, revision=mlp_revision)
            filenames = [
                s.rfilename
                for s in info.siblings
                if s.rfilename.endswith(extension)
                and len(s.rfilename.split("/")) == 1
                and "arguments" not in s.rfilename
                and "args" not in s.rfilename
                and "training" not in s.rfilename
            ]
            for filename in filenames:
                hf_hub_download(
                    mlp_model_id,
                    revision=mlp_revision,
                    filename=filename,
                )
            speculator_dir_path = Path(mlp_speculator_config).parent
            # if these are downloaded, they get converted to safetensors
            filenames.extend(
                [p for p in os.listdir(speculator_dir_path) if p.endswith(extension)]
            )
            speculator = {
                "path": Path(mlp_speculator_config).parent,
                "model_paths": filenames,
            }
        else:
            speculator = Path(mlp_model_id)
            filenames = [p for p in os.listdir(speculator) if p.endswith(extension)]
            speculator = {"path": speculator, "model_paths": filenames}
        method = "mlp_speculator"
    else:
        method = "n-gram"

    speculate = get_speculate()
    if speculate > 0:
        log_master(
            logger.info, f"Using speculation {method} with {speculate} input ids."
        )

    if model_type is None:
        # TODO: fix how we determine model type for Mamba
        if "ssm_cfg" in config_dict:
            # *only happens in Mamba case
            model_type = "mamba"
        else:
            raise RuntimeError(
                f"Could not determine model type for {model_id} revision {revision}"
            )

    if quantize == "exl2" and sharded:
        raise RuntimeError(
            "Sharding is currently not supported with `exl2` quantization"
        )

    sliding_window = (
        config_dict.get("sliding_window")
        if config_dict.get("sliding_window") is not None
        else -1
    )

    use_sliding_window = sliding_window is not None and sliding_window != -1
    needs_sliding_window = (
        max_input_tokens is not None and max_input_tokens > sliding_window
    )
    if use_sliding_window and needs_sliding_window and not SUPPORTS_WINDOWING:
        raise ValueError(
            f"The backend {SYSTEM} does not support sliding window attention that is used by the model type {model_type}. To use this model nonetheless with the {SYSTEM} backend, please launch TGI with the argument `--max-input-tokens` smaller than sliding_window={sliding_window} (got here max_input_tokens={max_input_tokens})."
        )
    if model_type == DEEPSEEK_V2:
        if FLASH_ATTENTION:
            head_size = max(
                config_dict.get("qk_nope_dim", 128)
                + config_dict.get("qk_rope_dim", 64),
                config_dict.get("v_head_dim", 128),
            )
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashDeepseekV2ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                default_dtype=torch.bfloat16,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=DeepseekV2Config,
                head_size=head_size,
            )
        elif sharded:
            raise NotImplementedError(
                FLASH_ATT_ERROR_MESSAGE.format("Sharded Deepseek V2")
            )
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
    elif model_type == DEEPSEEK_V3:
        if FLASH_ATTENTION:
            head_size = max(
                config_dict.get("qk_nope_dim", 128)
                + config_dict.get("qk_rope_dim", 64),
                config_dict.get("v_head_dim", 128),
            )
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashDeepseekV3ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                default_dtype=torch.bfloat16,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=DeepseekV3Config,
                head_size=head_size,
            )
        elif sharded:
            raise NotImplementedError(
                FLASH_ATT_ERROR_MESSAGE.format("Sharded Deepseek V3")
            )
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
    elif model_type == MAMBA:
        return Mamba(
            model_id,
            revision,
            quantize=quantize,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
    elif model_type == "ssm":
        raise RuntimeError(
            "`ssm` models have been deprecated in favor of `mamba` models, which follow standard HF formats. Check out a list here: https://huggingface.co/models?search=mamba%20-hf"
        )

    if model_id.startswith("facebook/galactica"):
        return CausalLM(
            model_id=model_id,
            # Yes galactica is just an OPT model.
            model_class=OPTForCausalLM,
            revision=revision,
            quantize=quantize,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
            batch_class=GalacticaCausalLMBatch,
        )

    if (
        model_type == GPT_BIGCODE
        or model_type == GPT2
        and model_id.startswith("bigcode/")
    ):
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashSantacoderForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                aliases={"transformer.wte.weight": ["lm_head.weight"]},
                num_kv_heads=1,
            )
        elif sharded:
            raise NotImplementedError(
                FLASH_ATT_ERROR_MESSAGE.format("Sharded Santacoder")
            )
        else:
            return CausalLM.fallback(
                model_id=model_id,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    if model_type == BLOOM:
        return CausalLM(
            model_id=model_id,
            model_class=BloomForCausalLM,
            revision=revision,
            quantize=quantize,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
            batch_class=BloomCausalLMBatch,
        )
    elif model_type == MPT:
        return CausalLM(
            model_id=model_id,
            model_class=MPTForCausalLM,
            revision=revision,
            quantize=quantize,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
            batch_class=CausalLMBatchKeysLast,
        )
    elif model_type == GPT2:
        if FLASH_ATTENTION:
            try:
                return FlashCausalLM(
                    model_id=model_id,
                    model_class=FlashGPT2ForCausalLM,
                    revision=revision,
                    quantize=quantize,
                    speculator=speculator,
                    dtype=dtype,
                    kv_cache_dtype=kv_cache_dtype,
                    trust_remote_code=trust_remote_code,
                    lora_adapter_ids=lora_adapter_ids,
                )
            except RuntimeError as e:
                # Lots of legacy models with various weight names.
                log_master(logger.warning, f"Couldn't load flash gpt2 variant: {e}")
                return CausalLM.fallback(
                    model_id,
                    revision,
                    quantize=quantize,
                    speculator=speculator,
                    dtype=dtype,
                    trust_remote_code=trust_remote_code,
                )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-2"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
    elif model_type == GPTJ:
        if FLASH_ATTENTION:
            try:
                return FlashCausalLM(
                    model_id=model_id,
                    model_class=FlashGPTJForCausalLM,
                    revision=revision,
                    quantize=quantize,
                    speculator=speculator,
                    dtype=dtype,
                    kv_cache_dtype=kv_cache_dtype,
                    trust_remote_code=trust_remote_code,
                    lora_adapter_ids=lora_adapter_ids,
                )
            except RuntimeError as e:
                # Lots of legacy models with various weight names.
                log_master(logger.warning, f"Couldn't load flash gptj variant: {e}")
                return CausalLM.fallback(
                    model_id,
                    revision,
                    quantize=quantize,
                    speculator=speculator,
                    dtype=dtype,
                    trust_remote_code=trust_remote_code,
                )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-J"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
    elif model_type == GPT_NEOX:
        if FLASH_ATTENTION:
            from text_generation_server.models.custom_modeling.flash_neox_modeling import (
                GPTNeoXConfig,
            )

            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGPTNeoXForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=GPTNeoXConfig,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            return CausalLM(
                model_id=model_id,
                model_class=GPTNeoxForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    elif model_type == PHI:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashPhiForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        else:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    elif model_type == PHI_MOE:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashLlamaForCausalLM,
                config_class=PhiMoEConfig,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    elif model_type == "phi-msft":
        if FLASH_ATTENTION:
            raise NotImplementedError(
                "Legacy phi-msft is not supported with Flash Attention"
            )
        else:
            return CausalLM(
                model_id=model_id,
                model_class=PhiForCausalLM,
                config_class=PhiConfig,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    elif model_type == LLAMA or model_type == PHI3 or model_type == GRANITE:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashLlamaForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError(
                FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}")
            )
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
    elif model_type == LLAMA4:
        if FLASH_TRANSFORMERS_BACKEND:
            from transformers import Llama4ForConditionalGeneration as Llama4Model

            return TransformersLlama4VlmCausalLM.fallback(
                model_id,
                Llama4Model,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                processor_kwargs={
                    "use_fast": True,
                },
            )
    elif model_type == BAICHUAN:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashLlamaForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif sharded:
            raise NotImplementedError(
                FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}")
            )
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    if model_type == GEMMA:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGemmaForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # Works better for these models
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
    elif model_type == GEMMA2:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGemma2ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # Works better for these models
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma2"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
    elif model_type == GEMMA3_TEXT:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashGemma3ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # TODO: once implemented in transformers, use the config class
                # and processor class from there.
                config_class=Gemma3TextConfig,
                # Works better for these models
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma3"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
    elif model_type == GEMMA3:
        if FLASH_ATTENTION:
            return VlmCausalLM(
                model_id=model_id,
                model_class=Gemma3ForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # TODO: once implemented in transformers, use the config class
                # and processor class from there.
                config_class=Gemma3Config,
                processor_class=Gemma3Processor,
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                support_chunking=False,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            from transformers import Gemma3ForConditionalGeneration as Gemma3Model

            return TransformersGemma3VlmCausalLM.fallback(
                model_id,
                Gemma3Model,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                support_chunking=False,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma3"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    if model_type == COHERE:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashCohereForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Cohere"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    if model_type == DBRX:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashDbrxForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # Dbrx works better in bfloat16.
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=DbrxConfig,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded DBRX"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    if model_type in ["RefinedWeb", "RefinedWebModel", FALCON]:
        if sharded:
            if FLASH_ATTENTION:
                if config_dict.get("alibi", False):
                    raise NotImplementedError("sharded is not supported for this model")
                return FlashCausalLM(
                    model_id=model_id,
                    model_class=FlashRWForCausalLM,
                    revision=revision,
                    quantize=quantize,
                    speculator=speculator,
                    dtype=dtype,
                    kv_cache_dtype=kv_cache_dtype,
                    aliases={
                        "lm_head.weight": ["transformer.word_embeddings.weight"],
                        "transformer.word_embeddings.weight": ["lm_head.weight"],
                    },
                    trust_remote_code=trust_remote_code,
                    lora_adapter_ids=lora_adapter_ids,
                    config_class=RWConfig,
                )
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Falcon"))
        else:
            if FLASH_ATTENTION and not config_dict.get("alibi", False):
                return FlashCausalLM(
                    model_id=model_id,
                    model_class=FlashRWForCausalLM,
                    revision=revision,
                    quantize=quantize,
                    speculator=speculator,
                    dtype=dtype,
                    kv_cache_dtype=kv_cache_dtype,
                    aliases={
                        "lm_head.weight": ["transformer.word_embeddings.weight"],
                        "transformer.word_embeddings.weight": ["lm_head.weight"],
                    },
                    trust_remote_code=trust_remote_code,
                    lora_adapter_ids=lora_adapter_ids,
                    config_class=RWConfig,
                )
            else:
                return CausalLM.fallback(
                    model_id,
                    revision,
                    quantize=quantize,
                    speculator=speculator,
                    dtype=dtype,
                    trust_remote_code=trust_remote_code,
                )

    if model_type == MISTRAL:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashMistralForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mistral"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    if model_type == MIXTRAL:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashMixtralForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mixtral"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    if model_type == STARCODER2:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=FlashStarcoder2ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError(
                FLASH_ATT_ERROR_MESSAGE.format("Sharded Starcoder2")
            )
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    if model_type == QWEN2:
        if FLASH_ATTENTION:
            return FlashCausalLM(
                model_id=model_id,
                model_class=Qwen2ForCausalLM,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Qwen2"))
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    if model_type == OPT:
        return CausalLM(
            model_id=model_id,
            model_class=OPTForCausalLM,
            revision=revision,
            quantize=quantize,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )

    if model_type == T5:
        return Seq2SeqLM(
            model_id=model_id,
            model_class=T5ForConditionalGeneration,
            revision=revision,
            quantize=quantize,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
            aliases={
                "shared.weight": [
                    "encoder.embed_tokens.weight",
                    "decoder.embed_tokens.weight",
                ]
            },
        )
    if model_type == IDEFICS:
        if FLASH_ATTENTION:
            return IdeficsCausalLM(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        else:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
    if model_type == QWEN2_VL:
        if FLASH_ATTENTION:
            return VlmCausalLM(
                model_id=model_id,
                model_class=Qwen2VLForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                default_dtype=torch.bfloat16,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                # TODO: Fix bug in rust image_text_replacement implementation
                support_chunking=False,
            )
        # TODO: Uncomment when transformers is refactored
        # elif FLASH_TRANSFORMERS_BACKEND:
        #     from transformers import Qwen2VLForConditionalGeneration as Qwen2VLModel

        #     return TransformersQwen2VlmCausalLM.fallback(
        #         model_id,
        #         Qwen2VLModel,
        #         revision,
        #         quantize=quantize,
        #         speculator=speculator,
        #         dtype=torch.bfloat16,
        #         trust_remote_code=trust_remote_code,
        #     )
        else:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Qwen2_VL"))
    if model_type == QWEN2_5_VL:
        if FLASH_ATTENTION:
            return VlmCausalLM(
                model_id=model_id,
                model_class=Qwen2_5VLForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                default_dtype=torch.bfloat16,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                config_class=Qwen2_5_VLConfig,
                processor_class=Qwen2_5_VLProcessor,
                # TODO: Fix bug in rust image_text_replacement implementation
                support_chunking=False,
            )
        # TODO: Uncomment when transformers is refactored
        # elif FLASH_TRANSFORMERS_BACKEND:
        #     return TransformersQwen2VlmCausalLM.fallback(
        #         model_id,
        #         Qwen2VLModel,
        #         revision,
        #         quantize=quantize,
        #         speculator=speculator,
        #         dtype=torch.bfloat16,
        #         trust_remote_code=trust_remote_code,
        #         config_class=Qwen2_5_VLConfig,
        #         processor_class=Qwen2_5_VLProcessor,
        #     )
        else:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Qwen2_5_VL"))
    if model_type == MLLAMA:
        if FLASH_ATTENTION:
            return MllamaCausalLM(
                model_id=model_id,
                model_class=MllamaForConditionalGeneration,
                batch_class=MllamaCausalLMBatch,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                support_chunking=False,
            )
        # TODO: Uncomment when transformers is refactored and cross attn is added
        # elif FLASH_TRANSFORMERS_BACKEND:
        #     from transformers import MllamaForConditionalGeneration as MllamaModel

        #     return TransformersFlashVlmCausalLM.fallback(
        #         model_id,
        #         MllamaModel,
        #         revision,
        #         quantize=quantize,
        #         speculator=speculator,
        #         dtype=torch.bfloat16,
        #         trust_remote_code=trust_remote_code,
        #         batch_class=MllamaCausalLMBatch,
        #     )
        else:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Mllama"))
    if model_type == IDEFICS2:
        if FLASH_ATTENTION:
            return VlmCausalLM(
                model_id=model_id,
                model_class=Idefics2ForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                # XXX: Extremely important to cap resolution in order to limit
                # VRAM usage.
                processor_kwargs={"size": {"longest_edge": 448, "shortest_edge": 378}},
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            from transformers import Idefics2ForConditionalGeneration as Idefics2Model

            return TransformersFlashVlmCausalLM.fallback(
                model_id,
                Idefics2Model,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
                processor_kwargs={"size": {"longest_edge": 448, "shortest_edge": 378}},
            )
        else:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
    if model_type == IDEFICS3:
        if FLASH_ATTENTION:
            return VlmCausalLM(
                model_id=model_id,
                model_class=Idefics3ForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
                # XXX: Extremely important to cap resolution in order to limit
                # VRAM usage.
                processor_kwargs={"size": {"longest_edge": 1456}},
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            from transformers import Idefics3ForConditionalGeneration as Idefics3Model

            return TransformersFlashVlmCausalLM.fallback(
                model_id,
                Idefics3Model,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                processor_kwargs={"size": {"longest_edge": 1456}},
            )
        else:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
    if model_type == PALIGEMMA:
        if FLASH_ATTENTION:
            return VlmCausalLM(
                model_id=model_id,
                model_class=PaliGemmaForConditionalGeneration,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                # Works better for these models
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            from transformers import PaliGemmaForConditionalGeneration as PaliGemmaModel

            return TransformersFlashVlmCausalLM.fallback(
                model_id,
                PaliGemmaModel,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
            )
        else:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("PaliGemma"))
    if model_type == LLAVA_NEXT:
        if FLASH_ATTENTION:
            return VlmCausalLM(
                model_class=LlavaNextForConditionalGeneration,
                model_id=model_id,
                revision=revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                kv_cache_dtype=kv_cache_dtype,
                trust_remote_code=trust_remote_code,
            )
        elif FLASH_TRANSFORMERS_BACKEND:
            from transformers import LlavaNextForConditionalGeneration as LlavaNextModel

            return TransformersFlashVlmCausalLM.fallback(
                model_id,
                LlavaNextModel,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        else:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("LlavaNext"))

    if quantize == "gptq":
        raise NotImplementedError(
            "gptq quantization is not supported for AutoModel, you can try to quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
        )
    if quantize == "awq":
        raise NotImplementedError("awq quantization is not supported for AutoModel")
    elif (quantize == "bitsandbytes-fp4") or (quantize == "bitsandbytes-nf4"):
        raise NotImplementedError("4bit quantization is not supported for AutoModel")
    elif quantize == "eetq":
        raise NotImplementedError("Eetq quantization is not supported for AutoModel")
    elif quantize == "exl2":
        raise NotImplementedError("exl2 quantization is not supported for AutoModel")

    auto_map = config_dict.get("auto_map", None)
    model_class = None

    # If the model is already in the library
    if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
        model_class = getattr(
            transformers, modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type]
        )
    elif (
        trust_remote_code
        and auto_map is not None
        and "AutoModelForCausalLM" in auto_map.keys()
    ):
        model_class = get_class_from_dynamic_module(
            config_dict["auto_map"]["AutoModelForCausalLM"], model_id
        )

    # This means the model is ForCausalLM
    if model_class is not None:
        if FLASH_TRANSFORMERS_BACKEND and model_class.is_backend_compatible():
            return TransformersFlashCausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
        elif sharded:
            raise NotImplementedError("sharded is not supported for AutoModel")
        else:
            return CausalLM.fallback(
                model_id,
                revision,
                quantize=quantize,
                speculator=speculator,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

    # Not supported at this point
    if sharded:
        raise NotImplementedError("sharded is not supported for AutoModel")

    # This means it is a ForSeq2SeqLM model
    if model_type in modeling_auto.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES or (
        trust_remote_code
        and auto_map is not None
        and "AutoModelForSeq2SeqLM" in auto_map.keys()
    ):
        return Seq2SeqLM.fallback(
            model_id,
            revision,
            quantize=quantize,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )

    raise ValueError(f"Unsupported model type {model_type}")


# get_model_with_lora_adapters wraps the internal get_model function and adds support for loading adapters
# this provides a post model loading hook to load adapters into the model after the model has been loaded
def get_model_with_lora_adapters(
    model_id: str,
    lora_adapters: Optional[List[AdapterInfo]],
    revision: Optional[str],
    sharded: bool,
    quantize: Optional[str],
    speculate: Optional[int],
    dtype: Optional[str],
    kv_cache_dtype: Optional[str],
    trust_remote_code: bool,
    max_input_tokens: int,
    adapter_to_index: Dict[str, int],
):
    lora_adapter_ids = [adapter.id for adapter in lora_adapters]
    model = get_model(
        model_id,
        lora_adapter_ids,
        revision,
        sharded,
        quantize,
        speculate,
        dtype,
        kv_cache_dtype,
        trust_remote_code,
        max_input_tokens,
    )

    if len(lora_adapters) > 0:
        target_to_layer = build_layer_weight_lookup(model.model)

        for index, adapter in enumerate(lora_adapters):
            # The AdapterParameters object allows for merging multiple adapters into a single adapter.
            # At the moment, we only support loading a single adapter into the model, but we keep the
            # AdapterParameters object for easier extension in the future.
            adapter_parameters = AdapterParameters(
                adapter_info=[adapter],
                # when merging multiple adapters we can weight them differently
                # if this is not set, all adapters will be weighted equally
                # see: text_generation_server.utils.merges.strategies for impl
                weights=None,
                merge_strategy=0,
                density=1.0,
                majority_sign_method=0,
            )

            adapter_index = index + 1
            adapter_to_index[adapter.id] = adapter_index

            logger.info(
                f"Loading adapter weights into model: {','.join([adapter.id for adapter in adapter_parameters.adapter_info])}"
            )
            weight_names = tuple([v[0] for v in target_to_layer.values()])
            (
                module_map,
                adapter_config,
                adapter_weight_names,
                adapter_tokenizer,
            ) = load_and_merge_adapters(
                model.model_id,
                adapter_parameters,
                adapter_index,
                weight_names,
                False,
            )

            unused_weight_names = adapter_weight_names.copy()

            adapter_layers = [
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
                "qkv_proj",
                # add c_* layers used in starcoder2
                "c_proj",
                "c_fc",
            ]

            for layer_name in adapter_layers:
                nlayers = (
                    1 if layer_name == "lm_head" else len(model.model.model.layers)
                )
                adapter_weights = LoraWeights.prepare_weights(
                    config=adapter_config,
                    module_map=module_map,
                    layer_type=layer_name,
                    unused_weight_names=unused_weight_names,
                    nlayers=nlayers,
                    dtype=model.dtype,
                    world_size=model.world_size,
                    process_group=model.process_group,
                    target_to_layer=target_to_layer,
                )

                if adapter_weights is None:
                    continue

                model.layer_to_adapter_weights[layer_name].add_adapter(
                    adapter_index, adapter_weights
                )

            if len(unused_weight_names) > 0:
                logger.warning(
                    f"{','.join([a.id for a in lora_adapters])} unused adapter weights: {unused_weight_names}"
                )

            if adapter_tokenizer is not None:
                model.tokenizers.add_tokenizer(adapter_index, adapter_tokenizer)

            model.loaded_adapters.add(adapter_index)

    return model


================================================
FILE: server/text_generation_server/models/bloom.py
================================================
import torch
import torch.distributed

from typing import Optional, Type

from transformers import (
    PreTrainedTokenizerBase,
)

from text_generation_server.models import CausalLM
from text_generation_server.models.causal_lm import CausalLMBatch
from text_generation_server.pb import generate_pb2


class BloomCausalLMBatch(CausalLMBatch):
    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "CausalLMBatch":
        batch = super().from_pb(pb=pb, tokenizer=tokenizer, dtype=dtype, device=device)
        batch.keys_head_dim_last = False
        return batch


class BLOOMSharded(CausalLM):
    @property
    def batch_type(self) -> Type[CausalLMBatch]:
        return BloomCausalLMBatch

    def forward(
        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
    ):
        outputs, speculative_logits = self.model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=True,
        )

        logits = outputs.logits
        return logits, speculative_logits, outputs.past_key_values


================================================
FILE: server/text_generation_server/models/causal_lm.py
================================================
import torch
import time
import torch.distributed

from dataclasses import dataclass
from opentelemetry import trace
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    PreTrainedTokenizerBase,
)
from typing import Optional, Tuple, List, Type, Dict

from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
)
from text_generation_server.models import Model
from text_generation_server.utils.chunks import concat_text_chunks
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.quantization import get_loader
from text_generation_server.utils.tokens import batch_top_tokens
from text_generation_server.models.types import (
    Batch,
    Tokens,
    Generation,
    GeneratedText,
)
from text_generation_server.pb import generate_pb2
from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling

tracer = trace.get_tracer(__name__)


@dataclass
class CausalLMBatch(Batch):
    batch_id: int
    requests: List[generate_pb2.Request]
    requests_idx_mapping: Dict[int, int]

    # Decoder values
    input_ids: torch.Tensor
    attention_mask: torch.Tensor
    position_ids: torch.Tensor
    past_key_values: Optional[List[Tuple]]

    # All tokens
    all_input_ids: List[torch.Tensor]

    # Lengths of all generations present in the batch
    input_lengths: List[int]
    prefix_offsets: List[int]
    read_offsets: List[int]

    # Generation helpers
    next_token_choosers: List[NextTokenChooser]
    stopping_criterias: List[StoppingCriteria]
    top_n_tokens: List[int]
    top_n_tokens_tensor: torch.Tensor

    # Metadata used for padding
    max_input_length: int
    padding_right_offset: int

    # Maximum number of tokens this batch will grow to
    max_tokens: int

    # Past metadata
    keys_head_dim_last: bool = True

    def to_pb(self) -> generate_pb2.CachedBatch:
        return generate_pb2.CachedBatch(
            id=self.batch_id,
            request_ids=[r.id for r in self.requests],
            size=len(self),
            max_tokens=self.max_tokens,
            current_tokens=len(self.input_ids),
        )

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "CausalLMBatch":
        inputs = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        prefix_offsets = []
        read_offsets = []
        requests_idx_mapping = {}

        # Parse batch
        max_truncation = 0
        padding_right_offset = 0
        max_decode_tokens = 0
        for i, r in enumerate(pb.requests):
            requests_idx_mapping[r.id] = i
            inputs.append(concat_text_chunks(r.input_chunks.chunks))

            next_token_choosers.append(
                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
            )
            stopping_criteria = StoppingCriteria.from_pb(
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(r.top_n_tokens)
            max_truncation = max(max_truncation, r.truncate)
            max_decode_tokens += stopping_criteria.max_new_tokens
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
            )

        tokenized_inputs = tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=max_truncation,
        ).to(device)
        for _ in pb.requests:
            input_len = tokenized_inputs["input_ids"].shape[1]
            prefix_offsets.append(input_len - 5)
            read_offsets.append(input_len)

        input_lengths = tokenized_inputs["attention_mask"].sum(1)
        max_input_length = input_lengths.max()

        input_ids = tokenized_inputs["input_ids"]
        # Allocate maximum attention_mask
        attention_mask = input_ids.new_zeros(
            (pb.size, max_input_length + padding_right_offset)
        )
        # Copy tokenizer attention_mask into fully allocated attention_mask
        attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"]

        position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1
        position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)
        all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1)
        top_n_tokens_tensor = torch.tensor(
            top_n_tokens, device=device, dtype=torch.int64
        )

        max_tokens = len(inputs) * (max_input_length + max_decode_tokens)

        return cls(
            batch_id=pb.id,
            requests=pb.requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=None,
            all_input_ids=list(all_input_ids),
            input_lengths=input_lengths.tolist(),
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length.item(),
            padding_right_offset=padding_right_offset,
            max_tokens=max_tokens,
        )

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]) -> Optional["CausalLMBatch"]:
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")
        if len(request_ids) == len(self):
            return self

        keep_indices = []

        # New values after filtering
        requests_idx_mapping = {}
        requests = []
        input_lengths = []
        prefix_offsets = []
        read_offsets = []
        all_input_ids = []
        max_input_length = 0

        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []

        total_remaining_decode_tokens = 0
        new_padding_right_offset = 0

        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            requests_idx_mapping[request_id] = i
            keep_indices.append(idx)

            requests.append(self.requests[idx])
            prefix_offsets.append(self.prefix_offsets[idx])
            read_offsets.append(self.read_offsets[idx])
            all_input_ids.append(self.all_input_ids[idx])

            request_input_length = self.input_lengths[idx]
            input_lengths.append(request_input_length)
            max_input_length = max(max_input_length, request_input_length)

            next_token_choosers.append(self.next_token_choosers[idx])
            stopping_criteria = self.stopping_criterias[idx]
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(self.top_n_tokens[idx])
            remaining_decode_tokens = (
                stopping_criteria.max_new_tokens - stopping_criteria.current_tokens
            )
            total_remaining_decode_tokens += remaining_decode_tokens
            new_padding_right_offset = max(
                new_padding_right_offset, remaining_decode_tokens
            )

        # Apply indices to input_ids, attention mask, past key values and other items that need to be cached
        input_ids = self.input_ids[keep_indices]
        position_ids = self.position_ids[keep_indices]
        self.attention_mask = self.attention_mask[
            keep_indices,
            -(self.padding_right_offset + max_input_length) : (
                self.attention_mask.shape[1] - self.padding_right_offset
            )
            + new_padding_right_offset,
        ]

        # Ensure that past_key_values tensors can be updated in-place
        if type(self.past_key_values[0]) is tuple:
            self.past_key_values = [list(layer) for layer in self.past_key_values]

        # Update tensors in-place to allow incremental garbage collection
        past_kv_length = max_input_length - 1
        for layer in self.past_key_values:
            past_keys, past_values = layer
            if len(past_keys.shape) == 3:
                # Force past to be of dim [self_size, num_heads, ...] for easy indexing
                past_keys = past_keys.view(len(self), -1, *past_keys.shape[-2:])
                past_values = past_values.view(len(self), -1, *past_values.shape[-2:])
            if self.keys_head_dim_last:
                layer[0] = past_keys[keep_indices, :, -past_kv_length:, :]
            else:
                layer[0] = past_keys[keep_indices, :, :, -past_kv_length:]
            del past_keys
            layer[1] = past_values[keep_indices, :, -past_kv_length:, :]
            del past_values

        top_n_tokens_tensor = self.top_n_tokens_tensor[keep_indices]
        max_tokens = len(request_ids) * max_input_length + total_remaining_decode_tokens

        self.requests = requests
        self.requests_idx_mapping = requests_idx_mapping
        self.input_ids = input_ids
        self.position_ids = position_ids
        self.all_input_ids = all_input_ids
        self.input_lengths = input_lengths
        self.prefix_offsets = prefix_offsets
        self.read_offsets = read_offsets
        self.next_token_choosers = next_token_choosers
        self.stopping_criterias = stopping_criterias
        self.top_n_tokens = top_n_tokens
        self.top_n_tokens_tensor = top_n_tokens_tensor
        self.max_input_length = max_input_length
        self.padding_right_offset = new_padding_right_offset
        self.max_tokens = max_tokens

        return self

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(cls, batches: List["CausalLMBatch"]) -> "CausalLMBatch":
        # Used for padding
        total_batch_size = 0
        max_input_length = 0
        padding_right_offset = 0
        for batch in batches:
            total_batch_size += len(batch)
            max_input_length = max(max_input_length, batch.max_input_length)
            padding_right_offset = max(padding_right_offset, batch.padding_right_offset)

        # Batch attributes
        requests = []
        requests_idx_mapping = {}
        input_lengths = []
        prefix_offsets = []
        read_offsets = []
        all_input_ids = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        max_tokens = 0

        # Batch tensors
        input_ids = None
        attention_mask = None
        position_ids = None
        past_key_values = []
        top_n_tokens_tensor = None

        # Used for slicing correctly inside the tensors
        # Equivalent to a cumsum on batch sizes
        start_index = 0
        for i, batch in enumerate(batches):
            requests.extend(batch.requests)
            input_lengths.extend(batch.input_lengths)
            prefix_offsets.extend(batch.prefix_offsets)
            read_offsets.extend(batch.read_offsets)
            all_input_ids.extend(batch.all_input_ids)
            next_token_choosers.extend(batch.next_token_choosers)
            stopping_criterias.extend(batch.stopping_criterias)
            top_n_tokens.extend(batch.top_n_tokens)

            if i == 0:
                requests_idx_mapping = batch.requests_idx_mapping
            else:
                # We need to offset the mapping for each batch by the cumulative batch size
                for k, v in batch.requests_idx_mapping.items():
                    requests_idx_mapping[k] = v + start_index

            # Slicing end index for this batch
            end_index = start_index + len(batch)

            # We only concatenate batches that did at least one step
            if batch.past_key_values is None:
                raise ValueError("only concatenate prefilled batches")

            # Create empty tensor
            # input_ids is always of shape [batch_size, 1]
            # We do not need to pad it
            if input_ids is None:
                input_ids = batch.input_ids.new_empty((total_batch_size, 1))
            # Copy to correct indices
            input_ids[start_index:end_index] = batch.input_ids

            # Create padded tensor
            if attention_mask is None:
                attention_mask = batch.attention_mask.new_zeros(
                    (total_batch_size, max_input_length + padding_right_offset),
                )

            if top_n_tokens_tensor is None:
                top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros(
                    total_batch_size,
                )
            top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor

            # We need to slice the attention mask to remove padding from previous steps
            # and to remove unused allocated space
            left_offset = max_input_length - batch.max_input_length
            batch_left_offset = (
                batch.attention_mask.shape[1]
                - batch.max_input_length
                - batch.padding_right_offset
            )
            attention_mask[
                start_index:end_index,
                left_offset:-padding_right_offset,
            ] = batch.attention_mask[
                :,
                batch_left_offset : -batch.padding_right_offset,
            ]

            # Create empty tensor
            # position_ids is always of shape [batch_size, 1]
            if position_ids is None:
                position_ids = batch.position_ids.new_empty((total_batch_size, 1))
            position_ids[start_index:end_index] = batch.position_ids

            # Shenanigans to get dimensions because BLOOM outputs a past with a different shape
            # BLOOM Keys:   [batch_size * num_heads, head_dim, seq_length]
            # BLOOM Values: [batch_size * num_heads, seq_length, head_dim]
            # And ensure that we can update tensors in-place
            if isinstance(batch.past_key_values[0], tuple):
                batch.past_key_values = [
                    [t.view(len(batch), -1, *t.shape[-2:]) for t in layer]
                    for layer in batch.past_key_values
                ]
            elif len(batch.past_key_values[0][0].shape) == 3:
                for layer in batch.past_key_values:
                    for k, t in enumerate(layer):
                        layer[k] = t.view(len(batch), -1, *t.shape[-2:])

            # Add eventual padding tokens that were added while concatenating
            max_tokens += batch.max_tokens + (
                max_input_length - batch.max_input_length
            ) * len(batch)

            start_index = end_index

        first_past_kvs = batches[0].past_key_values
        _, num_heads, padded_sequence_length, head_dim = first_past_kvs[0][1].shape

        padded_past_values_shape = (
            total_batch_size,
            num_heads,
            max_input_length - 1,
            head_dim,
        )

        if batches[0].keys_head_dim_last:
            padded_past_keys_shape = padded_past_values_shape
        else:
            # seq_length is last for BLOOM
            padded_past_keys_shape = (
                total_batch_size,
                num_heads,
                head_dim,
                max_input_length - 1,
            )

        # Iterate over attention layers
        # Concatenate past key values layer by layer to allow incremental garbage collection
        for j in range(len(first_past_kvs)):
            padded_past_keys = first_past_kvs[j][0].new_zeros(padded_past_keys_shape)
            start_index = 0
            for batch in batches:
                past_keys = batch.past_key_values[j][0]
                # Clear reference to the original tensor
                batch.past_key_values[j][0] = None

                # Slicing end index for this batch
                end_index = start_index + len(batch)
                # We slice the keys to remove the padding from previous batches
                past_seq_len = batch.max_input_length - 1
                if batch.keys_head_dim_last:
                    padded_past_keys[start_index:end_index, :, -past_seq_len:, :] = (
                        past_keys[:, :, -past_seq_len:, :]
                    )
                else:
                    # BLOOM case
                    padded_past_keys[start_index:end_index, :, :, -past_seq_len:] = (
                        past_keys[:, :, :, -past_seq_len:]
                    )
                del past_keys

                start_index = end_index

            padded_past_values = first_past_kvs[j][1].new_zeros(
                padded_past_values_shape
            )
            start_index = 0
            for batch in batches:
                past_values = batch.past_key_values[j][1]
                # Clear reference to the original tensor
                batch.past_key_values[j][1] = None

                # Slicing end index for this batch
                end_index = start_index + len(batch)
                # We slice the past values to remove the padding from previous batches
                past_seq_len = batch.max_input_length - 1
                padded_past_values[start_index:end_index, :, -past_seq_len:, :] = (
                    past_values[:, :, -past_seq_len:, :]
                )
                del past_values

                # Update values
                start_index = end_index

            past_key_values.append([padded_past_keys, padded_past_values])

        return cls(
            batch_id=batches[0].batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            all_input_ids=all_input_ids,
            input_lengths=input_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length,
            padding_right_offset=padding_right_offset,
            keys_head_dim_last=batches[0].keys_head_dim_last,
            max_tokens=max_tokens,
        )

    def __len__(self):
        return len(self.requests)


@dataclass
class CausalLMBatchKeysLast(CausalLMBatch):
    keys_head_dim_last: bool = False


class CausalLM(Model):
    def __init__(
        self,
        model_id: str,
        model_class,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        default_dtype=torch.float16,
        trust_remote_code: bool = False,
        tokenizer_class=AutoTokenizer,
        config_class=AutoConfig,
        batch_class=CausalLMBatch,
    ):
        self.quantize = quantize
        self.batch_class = batch_class
        self.process_group, rank, world_size = initialize_torch_distributed()
        if torch.cuda.is_available():
            device = torch.device(f"cuda:{rank}")
            dtype = default_dtype if dtype is None else dtype
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            device = torch.device(f"xpu:{rank}")
            dtype = default_dtype if dtype is None else dtype
        elif SYSTEM == "ipex":
            device = torch.device("cpu")
            # Float16 doesn't exist on target.
            dtype = torch.bfloat16 if dtype is None else dtype
        else:
            device = torch.device("cpu")
            dtype = torch.float32 if dtype is None else dtype

        tokenizer = tokenizer_class.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )

        config = config_class.from_pretrained(
            model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
        config.quantize = quantize
        config.speculator = speculator
        if tokenizer.pad_token_id is None:
            if config.pad_token_id is not None:
                tokenizer.pad_token_id = config.pad_token_id
            elif config.eos_token_id is not None:
                tokenizer.pad_token_id = config.eos_token_id
            elif tokenizer.eos_token_id is not None:
                tokenizer.pad_token_id = tokenizer.eos_token_id

        torch.distributed.barrier(group=self.process_group)
        weights_loader = get_loader(
            quantize=quantize, model_id=model_id, revision=revision
        )
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(
            filenames,
            device=device,
            dtype=dtype,
            process_group=self.process_group,
            weights_loader=weights_loader,
        )

        prefix = ""
        model = model_class(prefix, config, weights)

        torch.distributed.barrier(group=self.process_group)
        super().__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=True,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
        )

    @classmethod
    def fallback(
        cls,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        if speculator:
            raise RuntimeError("Speculator decoding is not enabled for AutoModel")

        device_count = 0
        if torch.cuda.is_available():
            device = torch.device("cuda")
            device_count = torch.cuda.device_count()
            dtype = torch.float16 if dtype is None else dtype
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            device = torch.device("xpu")
            device_count = torch.xpu.device_count()
            dtype = torch.float16 if dtype is None else dtype
        else:
            if quantize:
                raise ValueError("quantization is not available on CPU")

            device = torch.device("cpu")
            dtype = torch.float32 if dtype is None else dtype

        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=dtype,
            device_map=("auto" if device_count > 1 else None),
            load_in_8bit=quantize == "bitsandbytes",
            trust_remote_code=trust_remote_code,
        )
        if device_count == 1 and quantize != "bitsandbytes":
            model = model.to(device)

        if tokenizer.pad_token_id is None:
            if model.config.pad_token_id is not None:
                tokenizer.pad_token_id = model.config.pad_token_id
            elif model.config.eos_token_id is not None and isinstance(
                model.config.eos_token_id, int
            ):
                tokenizer.pad_token_id = model.config.eos_token_id
            elif tokenizer.eos_token_id is not None:
                tokenizer.pad_token_id = tokenizer.eos_token_id
            else:
                tokenizer.add_special_tokens({"pad_token": "[PAD]"})

        self = cls.__new__(
            cls,
        )
        self.batch_class = CausalLMBatch
        super().__init__(
            self,
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=True,
            dtype=dtype,
            device=device,
        )
        self.quantize = quantize
        return self

    @property
    def batch_type(self) -> Type[CausalLMBatch]:
        return self.batch_class

    def forward(
        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
    ) -> Tuple[
        torch.Tensor, Optional[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]
    ]:
        # Model Forward
        kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "use_cache": True,
            "return_dict": True,
        }
        if self.has_position_ids:
            kwargs["position_ids"] = position_ids

        outputs = self.model.forward(**kwargs)
        if isinstance(outputs, tuple):
            outputs, speculative_logits = outputs
        else:
            speculative_logits = None
        return outputs.logits, speculative_logits, outputs.past_key_values

    @tracer.start_as_current_span("generate_token")
    def generate_token(
        self, batch: CausalLMBatch
    ) -> Tuple[List[Generation], Optional[CausalLMBatch], Tuple[int, int]]:
        start = time.time_ns()
        # slice the attention mask to the correct shape
        attention_mask = batch.attention_mask[:, : -batch.padding_right_offset]

        logits, speculative_logits, past = self.forward(
            batch.input_ids,
            attention_mask,
            batch.position_ids,
            batch.past_key_values,
        )

        # Results
        generations: List[Generation] = []
        stopped = True

        # Speculation is not active for causal
        accepted_ids = torch.ones_like(batch.input_ids)[:, 0]
        batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
            batch.top_n_tokens,
            batch.top_n_tokens_tensor,
            torch.log_softmax(logits[:, -1], -1),
            accepted_ids,
        )

        start_decode = time.time_ns()

        # Zipped iterator
        iterator = zip(
            batch.requests,
            batch.input_lengths,
            batch.prefix_offsets,
            batch.read_offsets,
            logits,
            batch.next_token_choosers,
            batch.stopping_criterias,
            batch.all_input_ids,
            batch.top_n_tokens,
            batch_top_token_ids,
            batch_top_token_logprobs,
        )

        # For each member of the batch
        for i, (
            request,
            input_length,
            prefix_offset,
            read_offset,
            logits,
            next_token_chooser,
            stopping_criteria,
            all_input_ids,
            top_n_tokens,
            top_token_ids,
            top_token_logprobs,
        ) in enumerate(iterator):
            # Select next token
            next_token_id, logprobs = next_token_chooser(
                all_input_ids.view(1, -1), logits[-1:, :]
            )

            # Append next token to all tokens
            all_input_ids = torch.cat([all_input_ids, next_token_id])
            new_input_length = input_length + 1

            # Generated token
            next_token_logprob = logprobs[-1, next_token_id]
            next_token_id_squeezed = next_token_id.squeeze()
            next_token_text, prefix_offset, read_offset = self.decode_token(
                all_input_ids[:, 0], prefix_offset, read_offset
            )

            # Evaluate stopping criteria
            stop, reason = stopping_criteria(
                next_token_id_squeezed,
                next_token_text,
            )

            if not stop:
                stopped = False

            # Shard generations
            # All generations will be appended in the rust sharded client
            if i % self.world_size == self.rank:
                if stop:
                    # Decode generated tokens
                    output_text, _, _ = self.decode_token(
                        all_input_ids[:, 0],
                        prefix_offset=len(all_input_ids)
                        - stopping_criteria.current_tokens
                        - 1,
                        read_offset=len(all_input_ids)
                        - stopping_criteria.current_tokens,
                        skip_special_tokens=True,
                    )
                    # Get seed
                    if isinstance(next_token_chooser.choice, Sampling):
                        seed = next_token_chooser.choice.seed
                    else:
                        seed = None

                    generated_text = GeneratedText(
                        output_text, stopping_criteria.current_tokens, reason, seed
                    )
                else:
                    generated_text = None

                # Prefill
                if stopping_criteria.current_tokens == 1 and request.prefill_logprobs:
                    # Remove generated token to only have prefill and add nan for first prompt token
                    prefill_logprobs = [float("nan")] + torch.log_softmax(
                        logits, -1
                    ).gather(1, all_input_ids[1:]).squeeze(1)[
                        -new_input_length:-1
                    ].tolist()
                    prefill_token_ids = all_input_ids[-new_input_length:-1]
                    prefill_texts = self.tokenizer.batch_decode(
                        prefill_token_ids,
                        clean_up_tokenization_spaces=False,
                        skip_special_tokens=False,
                    )
                    prefill_tokens = Tokens(
                        prefill_token_ids,
                        prefill_logprobs,
                        prefill_texts,
                        is_special=[],
                    )
                else:
                    prefill_tokens = None

                if top_n_tokens > 0:
                    all_top_tokens = []
                    for top_token_ids, top_token_logprobs in zip(
                        top_token_ids, top_token_logprobs
                    ):
                        toptoken_texts = self.tokenizer.batch_decode(
                            top_token_ids,
                            clean_up_tokenization_spaces=False,
                            skip_special_tokens=False,
                        )
                        special_toptokens = [
                            token_id in self.all_special_ids
                            for token_id in top_token_ids
                        ]
                        top_tokens = Tokens(
                            top_token_ids,
                            top_token_logprobs,
                            toptoken_texts,
                            special_toptokens,
                        )
                        all_top_tokens.append(top_tokens)
                    top_tokens = all_top_tokens
                else:
                    top_tokens = None

                generation = Generation(
                    request.id,
                    prefill_tokens,
                    Tokens(
                        [next_token_id_squeezed],
                        [next_token_logprob],
                        [next_token_text],
                        [next_token_id_squeezed.item() in self.all_special_ids],
                    ),
                    generated_text,
                    top_tokens,
                )

                generations.append(generation)

            # Update values
            batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar(
                next_token_id_squeezed.item()
            )
            batch.input_ids[i, 0] = next_token_id
            batch.all_input_ids[i] = all_input_ids
            batch.input_lengths[i] = new_input_length
            batch.prefix_offsets[i] = prefix_offset
            batch.read_offsets[i] = read_offset
            batch.max_input_length = max(batch.max_input_length, new_input_length)

        # We finished all generations in the batch; there is no next batch
        if stopped:
            forward_ns = start_decode - start
            decode_ns = time.time_ns() - start_decode
            return generations, None, (forward_ns, decode_ns)

        # Slice unused values from prefill
        batch.input_ids = batch.input_ids[:, :1]

        # Update attention_mask as we added a new token to input_ids
        batch.attention_mask[:, -batch.padding_right_offset] = 1
        # Decrease right offset
        batch.padding_right_offset -= 1

        # Update position_ids
        batch.position_ids = batch.position_ids[:, -1:] + 1

        # Update past key values
        batch.past_key_values = past

        forward_ns = start_decode - start
        decode_ns = time.time_ns() - start_decode
        return generations, batch, (forward_ns, decode_ns)


================================================
FILE: server/text_generation_server/models/custom_modeling/__init__.py
================================================


================================================
FILE: server/text_generation_server/models/custom_modeling/bloom_modeling.py
================================================
# coding=utf-8
# Copyright 2022 HuggingFace Inc. team and BigScience workshop.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BLOOM model."""

import math
import os
import warnings
from typing import Optional, Tuple, Union

import torch
import torch.distributed
import torch.utils.checkpoint
from torch import nn
from torch.nn import LayerNorm
from torch.nn import functional as F

from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
)
from transformers import BloomConfig, PreTrainedModel

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    SpeculativeHead,
)

CUSTOM_KERNELS_ENABLED = False
if (
    torch.cuda.is_available()
    and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
):
    try:
        from custom_kernels import fused_bloom_attention_cuda

        CUSTOM_KERNELS_ENABLED = True
    except ImportError:
        pass

_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m"
_CONFIG_FOR_DOC = "BloomConfig"

BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "bigscience/bigscience-small-testing",
    "bigscience/bloom-560m",
    "bigscience/bloom-1b1",
    "bigscience/bloom-1b7",
    "bigscience/bloom-3b",
    "bigscience/bloom-7b1",
    "bigscience/bloom",
]


def _make_causal_mask(
    input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int
) -> torch.BoolTensor:
    """
    Make causal mask used for self-attention.
    """
    batch_size, target_length = input_ids_shape
    mask = torch.ones(
        (target_length, target_length + past_key_values_length),
        dtype=torch.bool,
        device=device,
    )
    mask = mask.triu(1 + past_key_values_length)

    expanded_mask = mask.unsqueeze(0).expand(
        batch_size, target_length, target_length + past_key_values_length
    )
    return expanded_mask


def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
    """
    Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.
    """
    batch_size, src_length = mask.shape
    tgt_length = tgt_length if tgt_length is not None else src_length

    expanded_mask = ~(mask[:, None, :].to(torch.bool))
    return expanded_mask.expand(batch_size, tgt_length, src_length)


def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int) -> torch.Tensor:
    """
    Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
    `softmax(l+a) = softmax(l)`. Based on
    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
    TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

    Args:
    Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
        attention_mask (`torch.Tensor`):
            Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
        num_heads (`int`, *required*):
            number of heads
        dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
            dtype of the output tensor
    """
    batch_size, seq_length = attention_mask.shape
    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
    base = torch.tensor(
        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
        device=attention_mask.device,
        dtype=torch.float32,
    )
    powers = torch.arange(
        1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32
    )
    slopes = torch.pow(base, powers)

    if closest_power_of_2 != num_heads:
        extra_base = torch.tensor(
            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
            device=attention_mask.device,
            dtype=torch.float32,
        )
        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
        extra_powers = torch.arange(
            1,
            1 + 2 * num_remaining_heads,
            2,
            device=attention_mask.device,
            dtype=torch.int32,
        )
        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)

    # Note: alibi will added to the attention bias that will be applied to the query, key product of attention
    # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length)
    # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length)
    # => the query_length dimension will then be broadcasted correctly
    # This is more or less identical to T5's relative position bias:
    # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527
    arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :]
    alibi = slopes[..., None] * arange_tensor
    return alibi


# @torch.jit.script
def dropout_add(
    x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool
) -> torch.Tensor:
    """
    Dropout add function

    Args:
        x (`torch.tensor`, *required*):
            input tensor
        residual (`torch.tensor`, *required*):
            esidual tensor
        prob (`float`, *required*):
            dropout probability
        training (`bool`, *required*):
            training mode
    """
    out = F.dropout(x, p=prob, training=training)
    out = residual + out
    return out


# @torch.jit.script # this is shit for unknow reasons.
def _split_heads(
    fused_qkv: torch.Tensor, num_heads: int, head_dim: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory
    storage as `fused_qkv`

    Args:
        fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]

    Returns:
        query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
        value: [batch_size, seq_length, num_heads, head_dim]
    """
    batch_size, seq_length, three_times_hidden_size = fused_qkv.shape
    fused_qkv = fused_qkv.view(batch_size, seq_length, num_heads, 3 * head_dim)
    query_layer, key_layer, value_layer = fused_qkv.split(head_dim, dim=-1)

    query_layer = query_layer.transpose(1, 2).reshape(
        batch_size * num_heads, seq_length, head_dim
    )
    key_layer = key_layer.permute(0, 2, 3, 1).reshape(
        batch_size * num_heads, head_dim, seq_length
    )
    value_layer = value_layer.transpose(1, 2).reshape(
        batch_size * num_heads, seq_length, head_dim
    )

    return query_layer, key_layer, value_layer


# @torch.jit.script
def _merge_heads(x: torch.Tensor, num_heads: int, head_dim: int) -> torch.Tensor:
    """
    Merge heads together over the last dimenstion

    Args:
        x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]

    Returns:
        torch.tensor: [batch_size, seq_length, num_heads * head_dim]
    """
    # What we want to achieve is:
    # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim
    batch_size_and_num_heads, seq_length, _ = x.shape
    batch_size = batch_size_and_num_heads // num_heads

    # First view to decompose the batch size
    # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim
    x = x.view(batch_size, num_heads, seq_length, head_dim)

    # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim
    x = x.permute(0, 2, 1, 3)

    # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim
    return x.reshape(batch_size, seq_length, num_heads * head_dim)


class BloomAttention(nn.Module):
    def __init__(self, prefix, config: BloomConfig, weights):
        super().__init__()

        self.pretraining_tp = config.pretraining_tp
        self.slow_but_exact = config.slow_but_exact

        self.process_group = weights.process_group

        self.hidden_size = config.hidden_size
        self.num_heads = config.n_head
        self.head_dim = self.hidden_size // self.num_heads
        self.split_size = self.hidden_size
        self.hidden_dropout = config.hidden_dropout

        if self.head_dim * self.num_heads != self.hidden_size:
            raise ValueError(
                f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:"
                f" {self.num_heads})."
            )

        # Layer-wise attention scaling
        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
        self.beta = 1.0

        process_group = weights.process_group
        if self.num_heads % process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {process_group.size()}"
            )
        self.num_heads = self.num_heads // process_group.size()
        self.query_key_value = TensorParallelColumnLinear.load(
            config=config,
            prefix=f"{prefix}.query_key_value",
            weights=weights,
            bias=True,
        )
        self.dense = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.dense", weights=weights, bias=True
        )
        self.attention_dropout = nn.Dropout(config.attention_dropout)

    @staticmethod
    def compute_attention(
        fused_qkv: torch.Tensor,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]],
        alibi: torch.Tensor,
        attention_mask: torch.Tensor,
        head_mask: Optional[torch.Tensor],
        beta: float,
        inv_norm_factor: float,
        num_heads: int,
        use_cache: bool,
    ):
        batch_size, q_length, three_times_hidden_size = fused_qkv.shape
        head_dim = three_times_hidden_size // (3 * num_heads)
        batch_size * num_heads

        ### TODO @thomasw21: this takes quite a bit of time, how do I accelerate that?
        # 3 x [batch_size, seq_length, num_heads, head_dim]
        (query_layer, key_layer, value_layer) = _split_heads(
            fused_qkv, num_heads=num_heads, head_dim=head_dim
        )

        if layer_past is not None:
            past_key, past_value = layer_past
            # concatenate along seq_length dimension:
            #  - key: [batch_size * self.num_heads, head_dim, kv_length]
            #  - value: [batch_size * self.num_heads, kv_length, head_dim]
            past_key = past_key.view(-1, *past_key.shape[-2:])
            key_layer = torch.cat((past_key, key_layer), dim=2)
            past_value = past_value.view(-1, *past_value.shape[-2:])
            value_layer = torch.cat((past_value, value_layer), dim=1)

        _, _, kv_length = key_layer.shape

        if use_cache is True:
            present = (key_layer, value_layer)
        else:
            present = None
        ###

        # [batch_size * num_heads, q_length, kv_length]
        # we use `torch.Tensor.baddbmm` instead of `torch.baddbmm` as the latter isn't supported by TorchScript v1.11
        attention_scores = alibi.baddbmm(
            batch1=query_layer,
            batch2=key_layer,
            beta=beta,
            alpha=inv_norm_factor,
        )

        # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
        input_dtype = attention_scores.dtype
        # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
        if input_dtype == torch.float16:
            attention_scores = attention_scores.to(torch.float)
        # torch.finfo not supported by torch.jit, we temporarily remplace with `-1e34`
        attn_weights = attention_scores.masked_fill_(
            attention_mask, torch.finfo(attention_scores.dtype).min
        )
        attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
            input_dtype
        )

        # # [batch_size, num_heads, q_length, kv_length]
        # attention_probs = self.attention_dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # matmul: [batch_size * num_heads, q_length, head_dim]
        context_layer = torch.bmm(attention_probs, value_layer, out=query_layer)

        # change view [batch_size, num_heads, q_length, head_dim]
        context_layer = _merge_heads(
            context_layer, num_heads=num_heads, head_dim=head_dim
        )

        return context_layer, present, attention_probs

    def forward(
        self,
        hidden_states: torch.Tensor,
        residual: torch.Tensor,
        alibi: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
    ):
        fused_qkv = self.query_key_value(
            hidden_states
        )  # [batch_size, seq_length, 3 x hidden_size]
        batch_size, q_length, _ = fused_qkv.shape

        if layer_past is not None:
            past_key, past_value = layer_past
            layer_past = (
                past_key.view(-1, *past_key.shape[-2:]),
                past_value.view(-1, *past_value.shape[-2:]),
            )

        if CUSTOM_KERNELS_ENABLED and attention_mask.shape[-1] < 4096:
            assert self.training is False, "Only foward pass was implemented"
            assert (
                attention_mask.shape[-1] < 4096
            ), "Custom kernel support only up to 4096 tokens"
            (
                context_layer,
                present,
                attention_probs,
            ) = fused_bloom_attention_cuda.forward(
                fused_qkv,
                layer_past,
                alibi,
                attention_mask,
                head_mask,
                self.beta,
                self.inv_norm_factor,
                self.num_heads,
                use_cache,
            )
        else:
            context_layer, present, attention_probs = self.compute_attention(
                fused_qkv=fused_qkv,
                layer_past=layer_past,
                alibi=alibi,
                attention_mask=attention_mask,
                head_mask=head_mask,
                beta=self.beta,
                inv_norm_factor=self.inv_norm_factor,
                num_heads=self.num_heads,
                use_cache=use_cache,
            )

        # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232
        if self.pretraining_tp > 1 and self.slow_but_exact:
            slices = self.hidden_size / self.pretraining_tp
            output_tensor = torch.zeros_like(context_layer)
            for i in range(self.pretraining_tp):
                output_tensor = output_tensor + F.linear(
                    context_layer[:, :, int(i * slices) : int((i + 1) * slices)],
                    self.dense.weight[:, int(i * slices) : int((i + 1) * slices)],
                )
        else:
            output_tensor = self.dense(context_layer)

        # output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training)
        output_tensor += residual

        outputs = (output_tensor, present)
        if output_attentions:
            outputs += (attention_probs,)

        return outputs


class BloomMLP(nn.Module):
    def __init__(self, prefix, config: BloomConfig, weights):
        super().__init__()

        self.pretraining_tp = config.pretraining_tp
        self.slow_but_exact = config.slow_but_exact
        self.dense_h_to_4h = TensorParallelColumnLinear.load(
            config=config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=True
        )
        self.dense_4h_to_h = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=True
        )
        self.gelu_impl = torch.nn.GELU(approximate="tanh")
        self.hidden_dropout = config.hidden_dropout

    def forward(
        self, hidden_states: torch.Tensor, residual: torch.Tensor
    ) -> torch.Tensor:
        hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states))

        if self.pretraining_tp > 1 and self.slow_but_exact:
            intermediate_output = torch.zeros_like(residual)
            slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp
            for i in range(self.pretraining_tp):
                intermediate_output = intermediate_output + F.linear(
                    hidden_states[:, :, int(i * slices) : int((i + 1) * slices)],
                    self.dense_4h_to_h.weight[
                        :, int(i * slices) : int((i + 1) * slices)
                    ],
                )
        else:
            intermediate_output = self.dense_4h_to_h(hidden_states)

        # output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training)
        intermediate_output += residual

        return intermediate_output


class BloomBlock(nn.Module):
    def __init__(self, layer_id: int, config: BloomConfig, weights):
        super().__init__()

        prefix = f"h.{layer_id}"
        self.input_layernorm = LayerNorm.load(
            prefix=f"{prefix}.input_layernorm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.num_heads = config.n_head
        self.self_attention = BloomAttention(
            prefix=f"{prefix}.self_attention", config=config, weights=weights
        )
        self.post_attention_layernorm = LayerNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.mlp = BloomMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.apply_residual_connection_post_layernorm = (
            config.apply_residual_connection_post_layernorm
        )
        self.hidden_dropout = config.hidden_dropout

    def forward(
        self,
        hidden_states: torch.Tensor,
        alibi: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        head_mask: Optional[torch.Tensor] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
    ):
        # hidden_states: [batch_size, seq_length, hidden_size]

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)

        # Layer norm post the self attention.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        # Self attention.
        attn_outputs = self.self_attention(
            layernorm_output,
            residual,
            layer_past=layer_past,
            attention_mask=attention_mask,
            alibi=alibi,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )

        attention_output = attn_outputs[0]

        outputs = attn_outputs[1:]

        layernorm_output = self.post_attention_layernorm(attention_output)

        # Get residual
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = attention_output

        # MLP.
        output = self.mlp(layernorm_output, residual)

        if use_cache:
            outputs = (output,) + outputs
        else:
            outputs = (output,) + outputs[1:]

        return outputs  # hidden_states, present, attentions


class BloomPreTrainedModel(PreTrainedModel):
    config_class = BloomConfig
    base_model_prefix = "transformer"
    _no_split_modules = ["BloomBlock"]

    @staticmethod
    def _convert_to_standard_cache(
        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
        """
        Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size,
        num_heads, ...]))
        """
        batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape
        num_heads = batch_size_times_num_heads // batch_size
        # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length]
        # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim]
        return tuple(
            (
                layer_past[0].view(batch_size, num_heads, head_dim, seq_length),
                layer_past[1].view(batch_size, num_heads, seq_length, head_dim),
            )
            for layer_past in past_key_value
        )

    @staticmethod
    def _convert_to_bloom_cache(
        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
        """
        Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        """
        batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
        batch_size_times_num_heads = batch_size * num_heads
        # key:  [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length]
        # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim]
        return tuple(
            (
                layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length),
                layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim),
            )
            for layer_past in past_key_value
        )


class BloomModel(BloomPreTrainedModel):
    def __init__(self, config: BloomConfig, weights):
        super().__init__(config)

        self.embed_dim = config.hidden_size
        self.num_heads = config.n_head

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()

        self.word_embeddings = TensorParallelEmbedding(
            prefix="word_embeddings", weights=weights
        )

        self.word_embeddings_layernorm = LayerNorm.load(
            prefix="word_embeddings_layernorm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        # Transformer blocks
        self.h = nn.ModuleList(
            [
                BloomBlock(layer_id=layer_id, config=config, weights=weights)
                for layer_id in range(config.num_hidden_layers)
            ]
        )

        # Final Layer Norm
        self.ln_f = LayerNorm.load(
            prefix="ln_f", weights=weights, eps=config.layer_norm_epsilon
        )

    def _prepare_attn_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: Tuple[int, int],
        past_key_values_length: int,
    ) -> torch.BoolTensor:
        # create causal mask
        # [batch_size, seq_length] -> [batch_size, tgt_length, src_length]
        combined_attention_mask = None
        device = attention_mask.device
        _, src_length = input_shape

        if src_length > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                device=device,
                past_key_values_length=past_key_values_length,
            )

        # [batch_size, seq_length] -> [batch_size, tgt_length, src_length]
        expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length)
        combined_attention_mask = (
            expanded_attn_mask
            if combined_attention_mask is None
            else expanded_attn_mask | combined_attention_mask
        )

        return combined_attention_mask

    def set_input_embeddings(self, new_embeddings: torch.Tensor):
        self.word_embeddings = new_embeddings

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **deprecated_arguments,
    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
        if deprecated_arguments.pop("position_ids", False) is not False:
            # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
            warnings.warn(
                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
                " passing `position_ids`.",
                FutureWarning,
            )
        if len(deprecated_arguments) > 0:
            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if past_key_values is None:
            past_key_values = tuple([None] * len(self.h))

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape batch_size x num_heads x N x N
        # head_mask has shape n_layer x batch x num_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        hidden_states = self.word_embeddings_layernorm(inputs_embeds)

        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        # Compute alibi tensor: check build_alibi_tensor documentation
        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values[0] is not None:
            past_key_values_length = past_key_values[0][0].shape[-1]
            seq_length_with_past = seq_length_with_past + past_key_values_length
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past), device=hidden_states.device
            )
        else:
            attention_mask = attention_mask.to(hidden_states.device)

        alibi = build_alibi_tensor(attention_mask, self.num_heads)

        causal_mask = self._prepare_attn_mask(
            attention_mask,
            input_shape=(batch_size, seq_length),
            past_key_values_length=past_key_values_length,
        )

        if hasattr(self, "tp_rank"):
            assert self.num_heads % self.tp_world_size == 0
            block_size = self.num_heads // self.tp_world_size
            alibi = alibi[
                :, self.tp_rank * block_size : (self.tp_rank + 1) * block_size
            ]
            alibi = alibi.reshape(batch_size * block_size, 1, seq_length_with_past)
            causal_mask = torch.repeat_interleave(causal_mask, block_size, dim=0)
        else:
            alibi = alibi.reshape(batch_size * self.num_heads, 1, seq_length_with_past)
            causal_mask = torch.repeat_interleave(causal_mask, self.num_heads, dim=0)

        alibi = alibi.to(hidden_states.dtype)

        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=causal_mask,
                head_mask=head_mask[i],
                use_cache=use_cache,
                output_attentions=output_attentions,
                alibi=alibi,
            )

            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (
                    outputs[2 if use_cache else 1],
                )

        # Add last hidden state
        hidden_states = self.ln_f(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    presents,
                    all_hidden_states,
                    all_self_attentions,
                ]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class BloomForCausalLM(BloomPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)
        self.transformer = BloomModel(config, weights)

        self.lm_head = SpeculativeHead.load(
            config,
            prefix="word_embeddings",
            weights=weights,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> dict:
        # only last token for input_ids if past is not None
        if past_key_values:
            input_ids = input_ids[:, -1].unsqueeze(-1)

            # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed
            if past_key_values[0][0].shape[0] == input_ids.shape[0]:
                past_key_values = self._convert_to_bloom_cache(past_key_values)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **deprecated_arguments,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        if deprecated_arguments.pop("position_ids", False) is not False:
            # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
            warnings.warn(
                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
                " passing `position_ids`.",
                FutureWarning,
            )
        if len(deprecated_arguments) > 0:
            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        logits, speculative_logits = self.lm_head(hidden_states)
        loss = None

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return (
            CausalLMOutputWithCrossAttentions(
                loss=loss,
                logits=logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            ),
            speculative_logits,
        )


================================================
FILE: server/text_generation_server/models/custom_modeling/clip.py
================================================
from typing import Optional, Tuple

import torch
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_attn_mask_utils import (
    _create_4d_causal_attention_mask,
    _prepare_4d_attention_mask,
)
from transformers.modeling_outputs import (
    BaseModelOutputWithPooling,
)
from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig

from text_generation_server.layers import (
    TensorParallelEmbedding,
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)


class CLIPVisionEmbeddings(nn.Module):
    def __init__(self, prefix, config: CLIPVisionConfig, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # TODO Should we TP this ?
        self.class_embedding = weights.get_tensor(f"{prefix}.class_embedding")

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.position_embedding", weights=weights
        )
        self.register_buffer(
            "position_ids",
            torch.arange(self.num_positions, device=weights.device).expand((1, -1)),
            persistent=False,
        )

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(
            pixel_values.to(dtype=target_dtype)
        )  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class CLIPTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(
            config.max_position_embeddings, embed_dim
        )

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids",
            torch.arange(config.max_position_embeddings).expand((1, -1)),
            persistent=False,
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = (
            input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class CLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_size = self.embed_dim // self.num_heads
        if self.head_size * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.embed_dim = self.embed_dim // weights.process_group.size()
        self.scale = self.head_size**-0.5
        self.dropout = config.attention_dropout

        self.qkv = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.out_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.out_proj",
            weights=weights,
            bias=True,
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.head_size)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj

        qkv = self.qkv(hidden_states)
        query_states, key_states, value_states = qkv.split(
            [
                self.head_size * self.num_heads,
            ]
            * 3,
            dim=2,
        )
        query_states = query_states * self.scale
        key_states = self._shape(key_states, -1, bsz)
        value_states = self._shape(value_states, -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_size)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = (
                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
                + causal_attention_mask
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = (
                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
                + attention_mask
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        attn_probs = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, None


class CLIPMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class CLIPEncoderLayer(nn.Module):
    def __init__(self, prefix, config: CLIPConfig, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.layer_norm1 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps
        )
        self.mlp = CLIPMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.layer_norm2 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class CLIPPreTrainedModel(nn.Module):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CLIPConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True


CLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
"""

CLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
"""

CLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
"""


class CLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, prefix, config: CLIPConfig, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                CLIPEncoderLayer(
                    prefix=f"{prefix}.layers.{i}", config=config, weights=weights
                )
                for i in range(config.num_hidden_layers)
            ]
        )

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
    ):
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        """

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
            )

        return hidden_states


class CLIPTextTransformer(nn.Module):
    def __init__(self, prefix: str, config: CLIPTextConfig, weights=None):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = CLIPTextEmbeddings(config)
        # Initialize weights and apply final processing with `self.post_init()`
        self.encoder = CLIPEncoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ):
        r"""
        Returns:

        """
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # CLIP's text model uses causal mask, prepare it here.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(
                attention_mask, hidden_states.dtype
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here.
            # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
            # ------------------------------------------------------------
            # text_embeds.shape = [batch_size, sequence_length, transformer.width]
            # take features from the eot embedding (eot_token is the highest number in each sequence)
            # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
            last_hidden_state[
                torch.arange(
                    last_hidden_state.shape[0], device=last_hidden_state.device
                ),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(
                    dim=-1
                ),
            ]
        else:
            # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible)
            last_hidden_state[
                torch.arange(
                    last_hidden_state.shape[0], device=last_hidden_state.device
                ),
                # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
                (
                    input_ids.to(dtype=torch.int, device=last_hidden_state.device)
                    == self.eos_token_id
                )
                .int()
                .argmax(dim=-1),
            ]

        return last_hidden_state


class CLIPTextModel(CLIPPreTrainedModel):
    config_class = CLIPTextConfig

    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, prefix, config: CLIPTextConfig):
        super().__init__(config)
        self.text_model = CLIPTextTransformer(prefix, config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ):
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""

        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )


class CLIPVisionTransformer(nn.Module):
    def __init__(self, prefix, config: CLIPVisionConfig, weights):
        super().__init__()
        self.config = config

        self.embeddings = CLIPVisionEmbeddings(
            prefix=f"{prefix}.embeddings", config=config, weights=weights
        )
        self.pre_layrnorm = nn.LayerNorm.load(
            prefix=f"{prefix}.pre_layrnorm", weights=weights, eps=config.layer_norm_eps
        )
        self.encoder = CLIPEncoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )
        # self.post_layernorm = nn.LayerNorm.load(prefix=f"{prefix}.post_layernorm", weights=weights, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
    ):
        r"""
        Returns:

        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
        )
        last_hidden_state = encoder_outputs
        # pooled_output = last_hidden_state[:, 0, :]
        # pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            # pooler_output=pooled_output,
            # hidden_states=encoder_outputs,
        )


class CLIPVisionModel(CLIPPreTrainedModel):
    config_class = CLIPVisionConfig
    main_input_name = "pixel_values"
    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
    ):
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""

        return self.vision_model(
            pixel_values=pixel_values,
        )


class CLIPModel(nn.Module):
    def __init__(self, prefix, config: CLIPConfig, weights):
        super().__init__()
        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = CLIPTextTransformer(text_config)
        self.vision_model = CLIPVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(
            self.vision_embed_dim, self.projection_dim, bias=False
        )
        self.text_projection = nn.Linear(
            self.text_embed_dim, self.projection_dim, bias=False
        )
        self.logit_scale = nn.Parameter(
            torch.tensor(self.config.logit_scale_init_value)
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
        )

        pooled_output = vision_outputs[1]  # pooled_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        return logits_per_image, logits_per_text


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
================================================
# coding=utf-8
# Copyright 2024 Cohere team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.utils.weights import UnquantizedWeight

if SYSTEM == "cuda":
    import dropout_layer_norm
else:
    dropout_layer_norm = None


class CohereRotary(PositionRotaryEmbedding):
    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        # Such controlflows may add some overhead.
        if SYSTEM == "cuda":
            from text_generation_server.utils.kernels import load_kernel

            rotary = load_kernel(module="rotary", repo_id="kernels-community/rotary")

            q1 = query[..., ::2]
            q2 = query[..., 1::2]

            rotary.apply_rotary(q1, q2, cos, sin, q1, q2, False)

            k1 = key[..., ::2]
            k2 = key[..., 1::2]

            rotary.apply_rotary(k1, k2, cos, sin, k1, k2, False)
        elif SYSTEM == "rocm":
            import vllm._custom_ops as ops

            # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
            # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773

            head_size = query.shape[-1]

            # Inplace operation, updating query and key.
            ops.rotary_embedding(query, key, head_size, cos, sin, False)
        elif SYSTEM == "ipex":
            import intel_extension_for_pytorch as ipex

            ipex.llm.functional.rotary_embedding(
                query, key, sin, cos, query.size(-1), False
            )
        else:
            raise ValueError(
                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
            )


class CohereLayerNorm(nn.Module):
    def __init__(self, prefix, weights, eps):
        super().__init__()
        weight = weights.get_sharded(f"{prefix}.weight", dim=0)
        self.weight = nn.Parameter(weight)
        # Fake weights
        self.ones = weight.new_ones(weight.shape[1])
        self.eps = eps

    def forward(self, hidden_states):
        if hidden_states.shape[-1] > 8192 or SYSTEM != "cuda":
            hidden_states = hidden_states.reshape(
                -1, self.weight.shape[0], self.weight.shape[1]
            )
            input_dtype = hidden_states.dtype
            hidden_states = hidden_states.to(torch.float32)
            mean = hidden_states.mean(-1, keepdim=True)
            hidden_states_minus_mean = hidden_states - mean
            variance = hidden_states_minus_mean.pow(2).mean(-1, keepdim=True)
            hidden_states = hidden_states_minus_mean * torch.rsqrt(variance + self.eps)
            hidden_states = self.weight.to(torch.float32) * hidden_states
            hidden_states = hidden_states.view(-1, self.weight.shape[1])
            return hidden_states.to(input_dtype)

        (
            hidden_states,
            *rest,
        ) = dropout_layer_norm.dropout_add_ln_fwd(
            hidden_states,
            None,
            self.ones,
            None,
            None,
            None,
            None,
            None,
            0.0,
            self.eps,
            1.0,
            0,
            None,
            False,
            False,
        )

        # Required to apply one weight matrix per head
        hidden_states = hidden_states.view(
            -1, self.weight.shape[0], self.weight.shape[1]
        )
        hidden_states = self.weight * hidden_states
        hidden_states = hidden_states.view(-1, self.weight.shape[1])

        return hidden_states


def load_attention(config, prefix, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=config.attention_bias,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    if config.attention_bias:
        w = [
            weights.get_sharded(f"{p}.bias", dim=0)
            for p in [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
        ]
        bias = torch.cat(w, dim=0).to(dtype=weights.dtype).to(device=weights.device)
    else:
        bias = None

    return TensorParallelColumnLinear(get_linear(weight, bias=bias))


class FlashCohereAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = CohereRotary.static(
            config=config,
            dim=self.head_size,
            base=config.rope_theta,
            device=weights.device,
        )

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.use_qk_norm = config.use_qk_norm
        if self.use_qk_norm:
            self.q_norm = CohereLayerNorm(
                prefix=f"{prefix}.q_norm",
                weights=weights,
                eps=config.layer_norm_eps,
            )
            self.k_norm = CohereLayerNorm(
                prefix=f"{prefix}.k_norm",
                weights=weights,
                eps=config.layer_norm_eps,
            )
        else:
            self.q_norm = None
            self.k_norm = None

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=config.attention_bias,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        qkv = self.query_key_value(hidden_states)
        query, key, value = qkv.split(
            [
                self.head_size * self.num_heads,
                self.head_size * self.num_key_value_heads,
                self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )

        if self.use_qk_norm:
            query = query.reshape(-1, self.head_size)
            key = key.reshape(-1, self.head_size)
            query = self.q_norm(query.contiguous())
            key = self.k_norm(key.contiguous())

        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_key_value_heads, self.head_size)
        value = value.view(-1, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, key, cos, sin)

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), reduce=False
        )


class CohereMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=False
        )


class FlashCohereLayer(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"
        self.self_attn = FlashCohereAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.mlp = CohereMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.input_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )
        self.process_group = weights.process_group

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        mlp_output = self.mlp(normed_hidden_states)
        output = attn_output + mlp_output

        if self.process_group.size() > 1:
            torch.distributed.all_reduce(output, group=self.process_group)

        return output, res


class FlashCohereModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.layers = nn.ModuleList(
            [
                FlashCohereLayer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.norm", weights=weights, eps=config.layer_norm_eps
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: torch.Tensor,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None

        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashCohereForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.model = FlashCohereModel(prefix, config, weights)
        try:
            self.lm_head = SpeculativeHead.load(
                config,
                prefix="lm_head",
                weights=weights,
            )
        except RuntimeError:
            self.lm_head = SpeculativeHead.load(
                config,
                prefix=f"{prefix}.embed_tokens",
                weights=weights,
            )
        self.logit_scale = config.logit_scale

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        logits *= self.logit_scale
        if speculative_logits is not None:
            speculative_logits *= self.logit_scale
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
================================================
# coding=utf-8
# Copyright 2022 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple, Any
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.kernels import load_kernel

if SYSTEM == "ipex":
    from intel_extension_for_pytorch.llm.modules import GatedMLPMOE
elif SYSTEM == "cuda":
    moe_kernels = load_kernel(module="moe", repo_id="kernels-community/moe")
else:
    import moe_kernels

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    FastLinear,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)


class DbrxAttentionConfig(PretrainedConfig):
    def __init__(
        self,
        attn_pdrop: float = 0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")


class DbrxFFNConfig(PretrainedConfig):
    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1,
        uniform_expert_assignment: bool = False,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {"name": "silu"}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights
        self.uniform_expert_assignment = uniform_expert_assignment

        if uniform_expert_assignment:
            raise ValueError("`uniform_expert_assignment = True` is not supported")

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")


class DbrxConfig(PretrainedConfig):
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "n_heads",
        "num_hidden_layers": "n_layers",
    }

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        max_seq_len: int = 2048,
        vocab_size: int = 32000,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_config: Optional[DbrxAttentionConfig] = None,
        ffn_config: Optional[DbrxFFNConfig] = None,
        use_cache: bool = True,
        initializer_range: float = 0.02,
        output_router_logits: bool = False,
        router_aux_loss_coef: float = 0.05,
        **kwargs: Any,
    ):
        if attn_config is None:
            self.attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            self.attn_config = DbrxAttentionConfig(**attn_config)
        else:
            self.attn_config = attn_config

        if ffn_config is None:
            self.ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            self.ffn_config = DbrxFFNConfig(**ffn_config)
        else:
            self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError("tie_word_embeddings is not supported for Dbrx models.")

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def num_key_value_heads(self):
        # We can't use the attribute map, since this the number of KV
        # heads is not top-level.
        return self.attn_config.kv_n_heads


def promote_scalar(x: torch.Tensor) -> torch.Tensor:
    return x.view(1) if len(x.size()) == 0 else x


def load_attention(config, prefix, weights):
    return TensorParallelColumnLinear.load_qkv(
        config,
        prefix=f"{prefix}.Wqkv",
        weights=weights,
        bias=False,
        num_heads=config.n_heads,
        num_key_value_heads=config.attn_config.kv_n_heads,
    )


def _load_experts(config, prefix, weights):
    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    assert (
        config.ffn_config.ffn_hidden_size % world_size == 0
    ), f"The chosen size {config.ffn_config.ffn_hidden_size} is not compatible with sharding on {world_size} shards"

    expert_size = config.ffn_config.ffn_hidden_size
    block_size = expert_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size

    tensor = torch.empty(
        (config.ffn_config.moe_num_experts * block_size, config.d_model),
        dtype=weights.dtype,
        device=weights.device,
    )

    slice_ = weights._get_slice(f"{prefix}")

    for i in range(config.ffn_config.moe_num_experts):
        offset = i * expert_size
        expert_slice = slice_[start + offset : stop + offset]

        tensor[i * block_size : (i + 1) * block_size] = expert_slice.to(
            dtype=weights.dtype
        ).to(device=weights.device)
    return tensor


def _load_experts_quantized(config, prefix, weights, cls):
    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    assert (
        config.ffn_config.ffn_hidden_size % world_size == 0
    ), f"The chosen size {config.ffn_config.ffn_hidden_size} is not compatible with sharding on {world_size} shards"

    expert_size = config.ffn_config.ffn_hidden_size
    block_size = expert_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size

    slice_ = weights._get_slice(f"{prefix}")

    experts = []
    for i in range(config.ffn_config.moe_num_experts):
        if config.quantize in ["gptq", "awq"]:
            raise NotImplementedError(
                "Dbrx does not support gptq/awq quantization yet."
            )
        else:
            offset = i * expert_size
            expert_slice = (
                slice_[start + offset : stop + offset]
                .to(dtype=weights.dtype)
                .to(device=weights.device)
            )

        if cls == TensorParallelRowLinear:
            expert_slice = expert_slice.t().contiguous()
            linear = get_linear(expert_slice, None)
            experts.append(cls(linear, weights.process_group))
        else:
            linear = get_linear(expert_slice, None)
            experts.append(cls(linear))

    return experts


class DbrxAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.clip_qkv = config.attn_config.clip_qkv
        self.num_heads = config.n_heads
        self.hidden_size = config.d_model
        self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=config.attn_config.rope_theta,
            device=weights.device,
        )

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.attn_config.kv_n_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.out_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        qkv = self.query_key_value(hidden_states)
        if self.clip_qkv is not None:
            qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)

        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class DbrxNormAttentionNorm(nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.norm_1 = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.norm_1", weights=weights, eps=1e-5
        )
        self.self_attn = DbrxAttention(
            prefix=f"{prefix}.attn", config=config, weights=weights
        )
        self.norm_2 = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.norm_2",
            weights=weights,
            eps=1e-5,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        normed_hidden_states, res = self.norm_1(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.norm_2(attn_output, res)

        return normed_attn_res_output, attn_res


@torch.jit.script
def select_experts(
    gate_logits: torch.Tensor, top_k: int, moe_normalize_expert_weights: int
):
    # all_probs: (sequence_length, n_experts) and upcast for softmax
    all_probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)
    # weights, selected_experts: (sequence_length, top-k)
    weights, selected_experts = torch.topk(all_probs, top_k, dim=-1)
    if moe_normalize_expert_weights:
        weights = weights / torch.norm(
            weights, p=moe_normalize_expert_weights, dim=-1, keepdim=True
        )
    weights = weights.view(-1)
    selected_experts = selected_experts.view(-1)

    return selected_experts, weights


@torch.jit.script
def round_up(x: torch.Tensor, value: int):
    return torch.div(x + (value - 1), value, rounding_mode="trunc") * value


class BlockSparseMoE(nn.Module):
    def __init__(self, prefix, config: DbrxConfig, weights):
        super().__init__()
        self.moe_normalize_expert_weights = (
            config.ffn_config.moe_normalize_expert_weights
        )
        self.hidden_dim = config.d_model
        self.ffn_dim = config.ffn_config.ffn_hidden_size // weights.process_group.size()
        self.num_experts = config.ffn_config.moe_num_experts
        self.top_k = config.ffn_config.moe_top_k

        act = config.ffn_config.ffn_act_fn["name"]
        if "gelu" in act:
            self.act = lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        elif "silu" in act:
            self.act = torch.nn.functional.silu
        else:
            self.act = ACT2FN[act]

        # gating
        self.gate = FastLinear.load(
            config, f"{prefix}.router.layer", weights, bias=False
        )

        # merged expert weights, all of size  (n_experts * ffn_dim, hidden_dim)
        w1 = _load_experts(config, f"{prefix}.experts.mlp.w1", weights).view(
            self.num_experts, self.ffn_dim, self.hidden_dim
        )
        v1 = _load_experts(config, f"{prefix}.experts.mlp.v1", weights).view(
            self.num_experts, self.ffn_dim, self.hidden_dim
        )
        self.wv1 = torch.cat([w1, v1], dim=1)
        self.w2 = (
            _load_experts(config, f"{prefix}.experts.mlp.w2", weights)
            .view(self.num_experts, self.ffn_dim, self.hidden_dim)
            .transpose(1, 2)
            .contiguous()
        )

        self.process_group = weights.process_group
        if SYSTEM == "ipex":
            self.ipex_fused_moe = GatedMLPMOE(
                W13=self.wv1, W2=self.w2, use_prepack=True
            )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # router_logits: (num_tokens, n_experts)
        router_logits = self.gate(x)

        if SYSTEM == "ipex":
            out = self.ipex_fused_moe(
                hidden_states=x,
                router_logits=router_logits,
                top_k=self.top_k,
                renormalize=self.moe_normalize_expert_weights,
                use_grouped_topk=False,
                num_expert_group=None,
                topk_group=None,
            )
        else:
            out = moe_kernels.fused_moe(
                x,
                self.wv1,
                self.w2,
                router_logits,
                self.top_k,
                renormalize=self.moe_normalize_expert_weights,
                inplace=True,
            )

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class DenseMoE(nn.Module):
    def __init__(self, prefix, config: DbrxConfig, weights):
        super().__init__()

        self.moe_normalize_expert_weights = (
            config.ffn_config.moe_normalize_expert_weights
        )
        self.hidden_dim = config.d_model
        self.ffn_dim = config.ffn_config.ffn_hidden_size // weights.process_group.size()
        self.num_experts = config.ffn_config.moe_num_experts
        self.top_k = config.ffn_config.moe_top_k

        act = config.ffn_config.ffn_act_fn["name"]
        if "gelu" in act:
            self.act = lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        elif "silu" in act:
            self.act = torch.nn.functional.silu
        else:
            self.act = ACT2FN[act]

        # gating
        self.gate = FastLinear.load(
            config, f"{prefix}.router.layer", weights, bias=False
        )

        self.w1 = _load_experts_quantized(
            config,
            prefix=f"{prefix}.experts.mlp.w1",
            weights=weights,
            cls=TensorParallelColumnLinear,
        )
        self.w2 = _load_experts_quantized(
            config,
            prefix=f"{prefix}.experts.mlp.w2",
            weights=weights,
            cls=TensorParallelRowLinear,
        )
        self.v1 = _load_experts_quantized(
            config,
            prefix=f"{prefix}.experts.mlp.v1",
            weights=weights,
            cls=TensorParallelColumnLinear,
        )

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (sequence_length, model_dim)
        gate_logits: (sequence_length, n_experts)
        """
        # optional reshape
        input_shape = x.shape
        x = x.view(-1, input_shape[-1])

        # gate_logits: (sequence_length, n_experts)
        gate_logits = self.gate(x)
        # all_probs: (sequence_length, n_experts) and upcast for softmax
        weights = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)

        if self.top_k < self.num_experts:
            _, not_selected_experts = torch.topk(
                weights,
                self.num_experts - self.top_k,
                largest=False,
                sorted=False,
                dim=1,
            )
            # Mask not selected experts
            weights.scatter_(1, not_selected_experts, 0)

        # Re-normalize
        if self.moe_normalize_expert_weights:
            weights = weights / torch.norm(
                weights, p=self.moe_normalize_expert_weights, dim=-1, keepdim=True
            )
        weights = weights.to(x.dtype)

        # Final output tensor
        out = x.new_zeros(x.shape[0], self.hidden_dim)
        for i in range(self.num_experts):
            h = self.act(self.w1[i](x)) * self.v1[i](x)
            h = self.w2[i](h, reduce=False)
            # Add expert output to out with masking
            out += h * weights[:, i].view(-1, 1)

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out


class DbrxLayer(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights):
        super().__init__()
        prefix = f"{prefix}.blocks.{layer_id}"

        self.attn = DbrxNormAttentionNorm(
            prefix=f"{prefix}.norm_attn_norm", config=config, weights=weights
        )

        moe_cls = BlockSparseMoE if config.quantize is None else DenseMoE
        self.moe = moe_cls(f"{prefix}.ffn", config, weights)

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        # Self Attention
        attn_output, attn_res = self.attn(
            hidden_states,
            residual,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        moe_output = self.moe(attn_output)

        return moe_output, attn_res


class DbrxModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.wte", weights=weights
        )

        self.layers = nn.ModuleList(
            [
                DbrxLayer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.n_layers)
            ]
        )
        self.norm = FastLayerNorm.load_no_bias(
            prefix=f"{prefix}.norm_f", weights=weights, eps=1e-5
        )

        self.head_size = self.layers[0].attn.self_attn.head_size
        self.num_heads = self.layers[0].attn.self_attn.num_heads
        self.num_key_value_heads = self.layers[0].attn.self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].attn.self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashDbrxForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"

        self.model = DbrxModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
================================================
# coding=utf-8
# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Type

import torch
import torch.distributed
from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig

from text_generation_server.layers import (
    FastLinear,
    SpeculativeHead,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
)
from text_generation_server.layers.attention import (
    Seqlen,
    attention,
    paged_attention,
)
from text_generation_server.layers.attention.kv_cache import KVCache, get_kv_scales
from text_generation_server.layers.layernorm import FastRMSNorm
from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.weights import Weights

if SYSTEM == "rocm":
    try:
        import vllm._custom_ops as ops
    except Exception as e:
        raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}")


class DeepseekV2Config(PretrainedConfig):
    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=2,
        n_routed_experts=160,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="gready",
        n_group=8,
        topk_group=3,
        num_experts_per_tok=6,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func="softmax",
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError(
                "tie_word_embeddings is not supported for Deepseek V2 models."
            )

        if ep_size != 1:
            raise ValueError(
                f"Currently only ep_size == 1 is supported for Deepseek V2 models, was {ep_size}"
            )

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class DeepseekV2Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights: Weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.kv_lora_rank = config.kv_lora_rank
        self.q_lora_rank = config.q_lora_rank
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.head_size = config.qk_nope_head_dim + config.qk_rope_head_dim
        self.value_head_size = config.v_head_dim
        self.head_pad_size = max(self.head_size, self.value_head_size)

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.qk_rope_head_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        mscale = get_mscale(
            self.rotary_emb.scaling_factor, self.rotary_emb.mscale_all_dim
        )
        self.softmax_scale = self.head_size**-0.5 * mscale * mscale

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        if self.q_lora_rank is None:
            self.q_proj = TensorParallelColumnLinear.load(
                config,
                prefix=f"{prefix}.q_proj",
                weights=weights,
                bias=config.attention_bias,
            )
        else:
            self.q_a_proj = get_linear(
                weight=weights.get_weights(f"{prefix}.q_a_proj"),
                bias=(
                    weights.get_tensor(f"{prefix}.q_a_proj.bias")
                    if config.attention_bias
                    else None
                ),
            )
            self.q_a_layernorm = FastRMSNorm.load(
                prefix=f"{prefix}.q_a_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
            self.q_b_proj = TensorParallelColumnLinear.load(
                config,
                prefix=f"{prefix}.q_b_proj",
                weights=weights,
                bias=config.attention_bias,
            )

        self.kv_a_proj_with_mqa = get_linear(
            weight=weights.get_weights(f"{prefix}.kv_a_proj_with_mqa"),
            bias=(
                weights.get_tensor(f"{prefix}.kv_a_proj_with_mqa.bias")
                if config.attention_bias
                else None
            ),
        )

        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.kv_a_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps
        )

        self.kv_b_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.kv_b_proj",
            weights=weights,
            bias=config.attention_bias,
        )

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        cu_seqlen_prefill: torch.Tensor,
        kv_cache: KVCache,
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ):
        if self.q_lora_rank is None:
            query = self.q_proj(hidden_states)
        else:
            query = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))[0])
        query = query.view(-1, self.num_heads, self.head_size)

        _, query_pe = torch.split(
            query, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, key_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )

        key_pe = key_pe.view(-1, 1, self.qk_rope_head_dim)
        kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv.contiguous())[0]).view(
            -1, self.num_key_value_heads, self.qk_nope_head_dim + self.value_head_size
        )

        key_nope, value = torch.split(
            kv, [self.qk_nope_head_dim, self.value_head_size], dim=-1
        )

        batch_size, heads, head_dim = query_pe.shape
        query_pe = (
            query_pe.view(batch_size, heads, head_dim // 2, 2)
            .transpose(2, 3)
            .reshape(batch_size, heads, head_dim)
        )
        batch_size, heads, head_dim = key_pe.shape
        key_pe = (
            key_pe.view(batch_size, heads, head_dim // 2, 2)
            .transpose(2, 3)
            .reshape(batch_size, heads, head_dim)
        )
        self.rotary_emb(query_pe, key_pe, cos, sin)

        query[..., self.qk_nope_head_dim :] = query_pe
        key = torch.empty_like(query)
        key[..., : self.qk_nope_head_dim] = key_nope
        key[..., self.qk_nope_head_dim :] = key_pe

        # We need to pad the heads because Flash Attention does not support
        # qk and v with different head sizes.
        query = torch.nn.functional.pad(
            query, (0, self.head_pad_size - self.head_size), value=0
        )
        key = torch.nn.functional.pad(
            key, (0, self.head_pad_size - self.head_size), value=0
        )
        value = torch.nn.functional.pad(
            value, (0, self.head_pad_size - self.value_head_size), value=0
        )

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        # Remove padding.
        attn_output = attn_output[..., : self.value_head_size]

        return self.o_proj(
            attn_output.reshape(-1, self.num_heads * self.value_head_size)
        )


class DeepseekV2MLP(nn.Module):
    def __init__(self, prefix: str, config, weights, intermediate_size: int):
        super().__init__()
        self.hidden_act = config.hidden_act
        if self.hidden_act != "silu":
            # Bail out because MoE only supports silu.
            raise NotImplementedError(
                "Currently only `silu` is supported as an activation for Deepseek V2."
            )
        self.act = ACT2FN[self.hidden_act]

        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )

        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )

        self.intermediate_size = intermediate_size // weights.process_group.size()

        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize

    def forward(self, hidden_states: torch.Tensor, reduce: bool = True):
        if (
            SYSTEM == "rocm"
            and self.hidden_act == "silu"
            and hidden_states.dtype == torch.float16
            and hidden_states.shape[0] == 1
            and not self.quantize
        ):
            out = torch.empty(
                hidden_states.shape[0],
                self.intermediate_size,
                dtype=hidden_states.dtype,
                device="cuda",
            )
            ops.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8)
            return self.down_proj(out, reduce=reduce)
        else:
            gate_up_states = self.gate_up_proj(hidden_states)
            gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
            return self.down_proj(
                self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=reduce
            )


class DeepseekV2MoE(nn.Module):
    def __init__(
        self,
        prefix,
        config: DeepseekV2Config,
        moe_layer_cls: Type[MoELayer],
        weights,
    ):
        super().__init__()

        self.hidden_dim = config.hidden_size
        self.moe_intermediate_size = (
            config.moe_intermediate_size // weights.process_group.size()
        )
        self.routed_scaling_factor = config.routed_scaling_factor

        # Gating
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)

        self.moe_layer = moe_layer_cls(
            prefix=f"{prefix}.experts",
            n_experts=config.n_routed_experts,
            n_expert_group=config.n_group,
            renormalize=config.norm_topk_prob,
            topk=config.num_experts_per_tok,
            topk_group=config.topk_group,
            weights=weights,
        )
        assert isinstance(self.moe_layer, MoELayer)

        if config.n_shared_experts is not None:
            self.shared_experts = DeepseekV2MLP(
                prefix=f"{prefix}.shared_experts",
                config=config,
                weights=weights,
                intermediate_size=config.moe_intermediate_size
                * config.n_shared_experts,
            )
        else:
            self.shared_experts = None

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.shared_experts is not None:
            shared_output = self.shared_experts(x, reduce=False)
        else:
            shared_output = None

        router_logits = self.gate(x)

        out = self.moe_layer(x, gating_output=router_logits)

        if shared_output is not None:
            out = out + shared_output

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class DeepseekV2Layer(nn.Module):
    def __init__(self, prefix, layer_id, config, weights):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"

        self.self_attn = DeepseekV2Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
        )

        if (
            config.n_routed_experts is not None
            and layer_id >= config.first_k_dense_replace
            and layer_id % config.moe_layer_freq == 0
        ):
            moe_layer_cls = (
                SparseMoELayer
                if SparseMoELayer.is_supported(weights)
                else DenseMoELayer
            )
            self.mlp = DeepseekV2MoE(f"{prefix}.mlp", config, moe_layer_cls, weights)
        else:
            self.mlp = DeepseekV2MLP(
                prefix=f"{prefix}.mlp",
                config=config,
                weights=weights,
                intermediate_size=config.intermediate_size,
            )

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        residual: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        cu_seqlen_prefill: torch.Tensor,
        kv_cache,
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ):
        normed_hidden_states, residual = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        # faster post attention rms norm
        normed_attn_res_output, residual = self.post_attention_layernorm(
            attn_output, residual
        )

        output = self.mlp(normed_attn_res_output)

        return output, residual


class DeepseekV2Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )

        self.layers = nn.ModuleList(
            [
                DeepseekV2Layer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashDeepseekV2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()

        self.model = DeepseekV2Model(
            "model" if not prefix else f"{prefix}.model", config, weights
        )
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head" if not prefix else f"{prefix}.lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py
================================================
# coding=utf-8
# Copyright 2023, 2024 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Type

import torch
import torch.distributed
from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig

from text_generation_server.layers import (
    FastLinear,
    SpeculativeHead,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
)
from text_generation_server.layers.attention import (
    Seqlen,
    attention,
    paged_attention,
)
from text_generation_server.layers.attention.kv_cache import KVCache, get_kv_scales
from text_generation_server.layers.layernorm import FastRMSNorm
from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.weights import Weights

if SYSTEM == "rocm":
    try:
        import vllm._custom_ops as ops
    except Exception as e:
        raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}")


class DeepseekV3Config(PretrainedConfig):
    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=2,
        n_routed_experts=160,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="gready",
        n_group=8,
        topk_group=3,
        num_experts_per_tok=6,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func="softmax",
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError(
                "tie_word_embeddings is not supported for Deepseek V2 models."
            )

        if ep_size != 1:
            raise ValueError(
                f"Currently only ep_size == 1 is supported for Deepseek V2 models, was {ep_size}"
            )

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class DeepseekV3Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights: Weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.kv_lora_rank = config.kv_lora_rank
        self.q_lora_rank = config.q_lora_rank
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.head_size = config.qk_nope_head_dim + config.qk_rope_head_dim
        self.value_head_size = config.v_head_dim
        self.head_pad_size = max(self.head_size, self.value_head_size)

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.qk_rope_head_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        mscale = get_mscale(
            self.rotary_emb.scaling_factor, self.rotary_emb.mscale_all_dim
        )
        self.softmax_scale = self.head_size**-0.5 * mscale * mscale

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        if self.q_lora_rank is None:
            self.q_proj = TensorParallelColumnLinear.load(
                config,
                prefix=f"{prefix}.q_proj",
                weights=weights,
                bias=config.attention_bias,
            )
        else:
            self.q_a_proj = get_linear(
                weight=weights.get_weights(f"{prefix}.q_a_proj"),
                bias=(
                    weights.get_tensor(f"{prefix}.q_a_proj.bias")
                    if config.attention_bias
                    else None
                ),
            )
            self.q_a_layernorm = FastRMSNorm.load(
                prefix=f"{prefix}.q_a_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
            self.q_b_proj = TensorParallelColumnLinear.load(
                config,
                prefix=f"{prefix}.q_b_proj",
                weights=weights,
                bias=config.attention_bias,
            )

        self.kv_a_proj_with_mqa = get_linear(
            weight=weights.get_weights(f"{prefix}.kv_a_proj_with_mqa"),
            bias=(
                weights.get_tensor(f"{prefix}.kv_a_proj_with_mqa.bias")
                if config.attention_bias
                else None
            ),
        )

        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.kv_a_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps
        )

        self.kv_b_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.kv_b_proj",
            weights=weights,
            bias=config.attention_bias,
        )

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        cu_seqlen_prefill: torch.Tensor,
        kv_cache: KVCache,
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ):
        if self.q_lora_rank is None:
            query = self.q_proj(hidden_states)
        else:
            query = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))[0])
        query = query.view(-1, self.num_heads, self.head_size)

        _, query_pe = torch.split(
            query, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, key_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )

        key_pe = key_pe.view(-1, 1, self.qk_rope_head_dim)
        kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv.contiguous())[0]).view(
            -1, self.num_key_value_heads, self.qk_nope_head_dim + self.value_head_size
        )

        key_nope, value = torch.split(
            kv, [self.qk_nope_head_dim, self.value_head_size], dim=-1
        )

        batch_size, heads, head_dim = query_pe.shape
        query_pe = (
            query_pe.view(batch_size, heads, head_dim // 2, 2)
            .transpose(2, 3)
            .reshape(batch_size, heads, head_dim)
        )
        batch_size, heads, head_dim = key_pe.shape
        key_pe = (
            key_pe.view(batch_size, heads, head_dim // 2, 2)
            .transpose(2, 3)
            .reshape(batch_size, heads, head_dim)
        )
        self.rotary_emb(query_pe, key_pe, cos, sin)

        query[..., self.qk_nope_head_dim :] = query_pe
        key = torch.empty_like(query)
        key[..., : self.qk_nope_head_dim] = key_nope
        key[..., self.qk_nope_head_dim :] = key_pe

        # We need to pad the heads because Flash Attention does not support
        # qk and v with different head sizes.
        query = torch.nn.functional.pad(
            query, (0, self.head_pad_size - self.head_size), value=0
        )
        key = torch.nn.functional.pad(
            key, (0, self.head_pad_size - self.head_size), value=0
        )
        value = torch.nn.functional.pad(
            value, (0, self.head_pad_size - self.value_head_size), value=0
        )

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        # Remove padding.
        attn_output = attn_output[..., : self.value_head_size]

        return self.o_proj(
            attn_output.reshape(-1, self.num_heads * self.value_head_size)
        )


class DeepseekV3MLP(nn.Module):
    def __init__(self, prefix: str, config, weights, intermediate_size: int):
        super().__init__()
        self.hidden_act = config.hidden_act
        if self.hidden_act != "silu":
            # Bail out because MoE only supports silu.
            raise NotImplementedError(
                "Currently only `silu` is supported as an activation for Deepseek V2."
            )
        self.act = ACT2FN[self.hidden_act]

        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )

        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )

        self.intermediate_size = intermediate_size // weights.process_group.size()

        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize

    def forward(self, hidden_states: torch.Tensor, reduce: bool = True):
        if (
            SYSTEM == "rocm"
            and self.hidden_act == "silu"
            and hidden_states.dtype == torch.float16
            and hidden_states.shape[0] == 1
            and not self.quantize
        ):
            out = torch.empty(
                hidden_states.shape[0],
                self.intermediate_size,
                dtype=hidden_states.dtype,
                device="cuda",
            )
            ops.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8)
            return self.down_proj(out, reduce=reduce)
        else:
            gate_up_states = self.gate_up_proj(hidden_states)
            gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
            return self.down_proj(
                self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=reduce
            )


class DeepseekV3MoE(nn.Module):
    def __init__(
        self,
        prefix,
        config: DeepseekV3Config,
        moe_layer_cls: Type[MoELayer],
        weights,
    ):
        super().__init__()

        self.hidden_dim = config.hidden_size
        self.moe_intermediate_size = (
            config.moe_intermediate_size // weights.process_group.size()
        )
        self.routed_scaling_factor = config.routed_scaling_factor

        # Gating
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)

        if config.topk_method == "noaux_tc":
            self.gate.e_score_correction_bias = torch.zeros(
                config.n_routed_experts, device=weights.device
            )
        else:
            self.gate.e_score_correction_bias = None

        self.moe_layer = moe_layer_cls(
            prefix=f"{prefix}.experts",
            n_experts=config.n_routed_experts,
            n_expert_group=config.n_group,
            renormalize=config.norm_topk_prob,
            topk=config.num_experts_per_tok,
            topk_group=config.topk_group,
            weights=weights,
            scoring_func=config.scoring_func,
            e_score_correction_bias=self.gate.e_score_correction_bias,
        )
        assert isinstance(self.moe_layer, MoELayer)

        if config.n_shared_experts is not None:
            self.shared_experts = DeepseekV3MLP(
                prefix=f"{prefix}.shared_experts",
                config=config,
                weights=weights,
                intermediate_size=config.moe_intermediate_size
                * config.n_shared_experts,
            )
        else:
            self.shared_experts = None

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.shared_experts is not None:
            shared_output = self.shared_experts(x, reduce=False)
        else:
            shared_output = None

        router_logits = self.gate(x)

        out = self.moe_layer(x, gating_output=router_logits)

        if shared_output is not None:
            out = out + shared_output

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class DeepseekV3Layer(nn.Module):
    def __init__(self, prefix, layer_id, config, weights):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"

        self.self_attn = DeepseekV3Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
        )

        if (
            config.n_routed_experts is not None
            and layer_id >= config.first_k_dense_replace
            and layer_id % config.moe_layer_freq == 0
        ):
            moe_layer_cls = (
                SparseMoELayer
                if SparseMoELayer.is_supported(weights)
                else DenseMoELayer
            )
            self.mlp = DeepseekV3MoE(f"{prefix}.mlp", config, moe_layer_cls, weights)
        else:
            self.mlp = DeepseekV3MLP(
                prefix=f"{prefix}.mlp",
                config=config,
                weights=weights,
                intermediate_size=config.intermediate_size,
            )

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        residual: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
        cu_seqlen_prefill: torch.Tensor,
        kv_cache,
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ):
        normed_hidden_states, residual = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        # faster post attention rms norm
        normed_attn_res_output, residual = self.post_attention_layernorm(
            attn_output, residual
        )

        output = self.mlp(normed_attn_res_output)

        return output, residual


class DeepseekV3Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )

        self.layers = nn.ModuleList(
            [
                DeepseekV3Layer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashDeepseekV3ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights: Weights):
        super().__init__()

        self.model = DeepseekV3Model(
            "model" if not prefix else f"{prefix}.model", config, weights
        )
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head" if not prefix else f"{prefix}.lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
from text_generation_server.utils.weights import UnquantizedWeight


class Gemma2Config(PretrainedConfig):
    def __init__(
        self,
        vocab_size=256128,
        hidden_size=3072,
        intermediate_size=24576,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        head_dim=256,
        hidden_act="gelu_pytorch_tanh",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class Gemma2FastRMSNorm(FastRMSNorm):
    @classmethod
    def load(cls, prefix: str, weights, eps=1e-6):
        dtype = weights.dtype
        weights.dtype = torch.float32
        weight = weights.get_tensor(f"{prefix}.weight") + 1
        weights.dtype = dtype
        new = cls(weight, eps)
        new.dtype = dtype
        return new

    # perform the multiplication in full precision and downcast after
    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states * self.weight
        return hidden_states.to(self.dtype), residual


def load_attention(config, prefix: str, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.head_dim
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias=None))


class FlashGemma2Attention(torch.nn.Module):
    def __init__(
        self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_size = config.head_dim
        self.causal = causal
        if is_sliding:
            self.window_size = config.sliding_window
        else:
            self.window_size = -1

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=config.rope_theta,
            device=weights.device,
        )

        # self.softmax_scale = self.head_size**-0.5
        self.softmax_scale = config.query_pre_attn_scalar**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )
        self.softcap = config.attn_logit_softcapping

        query_key_value = load_attention(config, prefix, weights)
        self.query_key_value = TensorParallelMultiAdapterLinear.load(
            query_key_value,
            layer_id,
            ["q_proj", "k_proj", "v_proj"],
            sizes=[
                self.head_size * config.num_attention_heads,
                self.head_size * config.num_key_value_heads,
                self.head_size * config.num_key_value_heads,
            ],
            process_group=weights.process_group,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            layer_id,
            "o_proj",
            process_group=weights.process_group,
        )

        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        adapter_data,
    ):
        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
                window_size_left=self.window_size,
                softcap=self.softcap,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                softcap=self.softcap,
                kv_scales=self.kv_scales,
                window_size_left=self.window_size,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Gemma2MLP(nn.Module):
    def __init__(self, prefix, config, weights, layer_id):
        super().__init__()
        act = config.hidden_activation
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            layer_id,
            ["gate_proj", "up_proj"],
            sizes=[
                config.intermediate_size,
                config.intermediate_size,
            ],
            process_group=weights.process_group,
        )

        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            layer_id,
            "down_proj",
            process_group=weights.process_group,
        )

        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


class FlashGemma2Layer(nn.Module):
    def __init__(
        self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool
    ):
        super().__init__()
        self.self_attn = FlashGemma2Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            layer_id=layer_id,
            causal=causal,
            is_sliding=is_sliding,
        )
        self.mlp = Gemma2MLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id
        )

        self.input_layernorm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.pre_feedforward_layernorm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.pre_feedforward_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.post_feedforward_layernorm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.post_feedforward_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        adapter_data,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            adapter_data,
        )

        # faster post attention rms norm
        normed_attn_res_output, _ = self.post_attention_layernorm(attn_output)
        normed_attn_res_output = normed_attn_res_output + res
        res = normed_attn_res_output

        pre_normed, _ = self.pre_feedforward_layernorm(normed_attn_res_output)
        mlp_output = self.mlp(pre_normed, adapter_data)
        post_hidden_states, _ = self.post_feedforward_layernorm(mlp_output)

        return post_hidden_states, normed_attn_res_output


class FlashGemma2Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.layers = nn.ModuleList(
            [
                FlashGemma2Layer(
                    prefix=f"{prefix}.layers.{layer_id}",
                    config=config,
                    weights=weights,
                    layer_id=layer_id,
                    causal=causal,
                    is_sliding=layer_id % 2 == 0,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = Gemma2FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = inputs_embeds

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
                adapter_data,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashGemma2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, *, causal: bool = True):
        super().__init__()

        embed_norm = config.hidden_size**0.5
        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.embed_tokens.weight *= embed_norm

        self.model = FlashGemma2Model(
            prefix=prefix, config=config, weights=weights, causal=causal
        )
        self.lm_head = SpeculativeHead.load(
            prefix=(
                f"{prefix}.embed_tokens"
                if config.tie_word_embeddings
                else f"{prefix}.lm_head"
            ),
            config=config,
            weights=weights,
        )
        self.softcap = config.final_logit_softcapping
        assert isinstance(self.softcap, float)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_embeds = self.embed_tokens(input_ids)
        hidden_states = self.model(
            input_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)

        logits /= self.softcap
        logits = torch.tanh(logits)
        logits *= self.softcap

        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
================================================
# coding=utf-8
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed
from torch import nn
from typing import Optional, List, Tuple
import copy

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
    #
    SpeculativeHead,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)

import torch
import torch.nn.functional as F


from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
    load_vision_model,
)


from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
from text_generation_server.models.globals import ATTENTION
from text_generation_server.utils.weights import UnquantizedWeight
from transformers.activations import ACT2FN
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)


ATTENTION_TYPE_GLOBAL = "global"
ATTENTION_TYPE_LOCAL = "local_sliding"


class Gemma3FastRMSNorm(FastRMSNorm):
    @classmethod
    def load(cls, prefix: str, weights, eps=1e-6):
        dtype = weights.dtype
        weights.dtype = torch.float32
        weight = weights.get_tensor(f"{prefix}.weight") + 1
        weights.dtype = dtype
        new = cls(weight, eps)
        new.dtype = dtype
        return new

    # perform the multiplication in full precision and downcast after
    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states * self.weight
        return hidden_states.to(self.dtype), residual


def load_attention(config, prefix: str, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.head_dim
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias=None))


class FlashGemma3Attention(torch.nn.Module):
    def __init__(
        self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_size = config.head_dim
        self.causal = causal
        if is_sliding:
            self.window_size = config.sliding_window
            # TODO: remove this hack to support local sliding window
            config = copy.deepcopy(config)
            config.rope_scaling = dict(rope_type="default")
            self.rotary_emb = PositionRotaryEmbedding.static(
                config=config,
                dim=config.head_dim,
                base=config.rope_local_base_freq,
                device=weights.device,
            )
        else:
            self.window_size = -1
            self.rotary_emb = PositionRotaryEmbedding.static(
                config=config,
                dim=config.head_dim,
                base=config.rope_theta,
                device=weights.device,
            )

        self.softmax_scale = (
            config.query_pre_attn_scalar**-0.5
            if config.query_pre_attn_scalar is not None
            else None
        )
        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )
        self.softcap = None  # config.attn_logit_softcapping

        query_key_value = load_attention(config, prefix, weights)
        self.query_key_value = TensorParallelMultiAdapterLinear.load(
            query_key_value,
            layer_id,
            ["q_proj", "k_proj", "v_proj"],
            sizes=[
                self.head_size * config.num_attention_heads,
                self.head_size * config.num_key_value_heads,
                self.head_size * config.num_key_value_heads,
            ],
            process_group=weights.process_group,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            layer_id,
            "o_proj",
            process_group=weights.process_group,
        )

        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)
        self.q_norm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps
        )
        self.k_norm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps
        )
        self.enable_gqa = self.num_heads != self.num_key_value_heads

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        adapter_data,
        attention_mask,
    ):

        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )

        kv = kv.view(-1, 2, self.num_key_value_heads * self.head_size)
        key = kv[:, 0]
        value = kv[:, 1]

        query = query.reshape(-1, self.head_size)
        key = key.reshape(-1, self.head_size)

        query, _ = self.q_norm(query.contiguous())
        key, _ = self.k_norm(key.contiguous())

        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_key_value_heads, self.head_size)
        value = value.view(-1, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, key, cos, sin)

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            if attention_mask is None or ATTENTION == "flashinfer":
                # flash attention
                attn_output = attention(
                    query=query,
                    key=key,
                    value=value,
                    kv_cache=kv_cache,
                    kv_scales=self.kv_scales,
                    seqlen=seqlen,
                    block_tables=block_tables,
                    softmax_scale=self.softmax_scale,
                    window_size_left=self.window_size,
                    softcap=self.softcap,
                )
            else:
                lengths = cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]

                # Split tensors using vectorized split
                query_list = torch.split(query, lengths.tolist(), dim=0)
                key_list = torch.split(key, lengths.tolist(), dim=0)
                value_list = torch.split(value, lengths.tolist(), dim=0)

                padded_query = torch.nn.utils.rnn.pad_sequence(
                    query_list, batch_first=True
                )
                padded_key = torch.nn.utils.rnn.pad_sequence(key_list, batch_first=True)
                padded_value = torch.nn.utils.rnn.pad_sequence(
                    value_list, batch_first=True
                )

                padded_query = padded_query.transpose(1, 2).contiguous()
                padded_key = padded_key.transpose(1, 2).contiguous()
                padded_value = padded_value.transpose(1, 2).contiguous()

                # Compute attention
                attn_output = F.scaled_dot_product_attention(
                    padded_query,
                    padded_key,
                    padded_value,
                    attn_mask=attention_mask,
                    scale=self.softmax_scale,
                    enable_gqa=self.enable_gqa,
                )

                attn_output = attn_output.transpose(
                    1, 2
                )  # [batch_size, seq_len, num_heads, head_dim]
                max_seq_len = padded_query.size(2)
                seq_range = torch.arange(
                    max_seq_len, device=padded_query.device
                ).unsqueeze(0)
                lengths_tensor = torch.tensor(
                    lengths, device=padded_query.device
                ).unsqueeze(1)
                mask = seq_range < lengths_tensor  # [batch, max_seq_len]
                attn_output = attn_output[mask]  # [total_seq_len, num_heads, head_dim]

        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                softcap=self.softcap,
                kv_scales=self.kv_scales,
                window_size_left=self.window_size,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Gemma3MLP(nn.Module):
    def __init__(self, prefix, config, weights, layer_id):
        super().__init__()
        act = config.hidden_activation
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            layer_id,
            ["gate_proj", "up_proj"],
            sizes=[
                config.intermediate_size,
                config.intermediate_size,
            ],
            process_group=weights.process_group,
        )

        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            layer_id,
            "down_proj",
            process_group=weights.process_group,
        )

        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


class FlashGemma3Layer(nn.Module):
    def __init__(
        self, prefix: str, config, weights, layer_id, causal: bool, is_sliding: bool
    ):
        super().__init__()
        self.self_attn = FlashGemma3Attention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            layer_id=layer_id,
            causal=causal,
            is_sliding=is_sliding,
        )
        self.mlp = Gemma3MLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id
        )

        self.input_layernorm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.pre_feedforward_layernorm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.pre_feedforward_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.post_feedforward_layernorm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.post_feedforward_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        adapter_data,
        attention_mask,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            adapter_data,
            attention_mask,
        )

        # faster post attention rms norm
        normed_attn_res_output, _ = self.post_attention_layernorm(attn_output)
        normed_attn_res_output = normed_attn_res_output + res
        res = normed_attn_res_output

        pre_normed, _ = self.pre_feedforward_layernorm(normed_attn_res_output)
        mlp_output = self.mlp(pre_normed, adapter_data)
        post_hidden_states, _ = self.post_feedforward_layernorm(mlp_output)

        return post_hidden_states, normed_attn_res_output


class FlashGemma3Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()

        self.layers = nn.ModuleList(
            [
                FlashGemma3Layer(
                    prefix=f"{prefix}.layers.{layer_id}",
                    config=config,
                    weights=weights,
                    layer_id=layer_id,
                    causal=causal,
                    is_sliding=bool((layer_id + 1) % config.sliding_window_pattern),
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        adapter_data: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        attention_mask_local: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = inputs_embeds

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer

        residual = None
        for i, layer in enumerate(self.layers):
            cos, sin = self.layers[i].self_attn.rotary_emb.get_cos_sin(
                position_ids, max_s, hidden_states.dtype
            )

            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
                adapter_data,
                (
                    attention_mask
                    if self.layers[i].self_attn.window_size == -1
                    else attention_mask_local
                ),
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashGemma3ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, *, causal: bool = True):
        super().__init__()

        embed_norm = config.hidden_size**0.5
        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.embed_tokens.weight *= embed_norm

        self.model = FlashGemma3Model(
            prefix=prefix, config=config, weights=weights, causal=causal
        )
        self.lm_head = SpeculativeHead.load(
            prefix=(
                f"{prefix}.embed_tokens"
                if config.tie_word_embeddings
                else f"{prefix}.lm_head"
            ),
            config=config,
            weights=weights,
        )
        # self.softcap = config.attn_logit_softcapping
        # assert isinstance(self.softcap, float)
        self.softcap = None

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_embeds = self.embed_tokens(input_ids)

        hidden_states = self.model(
            input_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)

        return logits, speculative_logits


class Gemma3MultimodalInputProjection(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.mm_input_projection_weight = weights.get_tensor(
            "multi_modal_projector.mm_input_projection_weight"
        )

        self.mm_soft_emb_norm = Gemma3FastRMSNorm.load(
            prefix=f"{prefix}.mm_soft_emb_norm",
            weights=weights,
            eps=config.vision_config.layer_norm_eps,
        )

        self.patches_per_image = int(
            config.vision_config.image_size // config.vision_config.patch_size
        )
        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
        self.kernel_size = self.patches_per_image // self.tokens_per_side
        self.avg_pool = nn.AvgPool2d(
            kernel_size=self.kernel_size, stride=self.kernel_size
        )

    def forward(self, vision_outputs: torch.Tensor):
        batch_size, _, seq_length = vision_outputs.shape

        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
            batch_size, seq_length, self.patches_per_image, self.patches_per_image
        )
        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()

        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)

        normed_vision_outputs, _ = self.mm_soft_emb_norm(pooled_vision_outputs)

        projected_vision_outputs = torch.matmul(
            normed_vision_outputs, self.mm_input_projection_weight
        )
        return projected_vision_outputs.type_as(vision_outputs)


class Gemma3ForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.config = config

        if config.vision_config is not None:

            config.vision_config.quantize = config.quantize

            self.post_vision_model_layernorm = nn.LayerNorm.load(
                prefix="vision_tower.vision_model.post_layernorm",
                weights=weights,
                eps=config.vision_config.layer_norm_eps,
            )

            self.multimodal_projector = Gemma3MultimodalInputProjection(
                prefix="multi_modal_projector",
                config=config,
                weights=weights,
            )

            text_config = config.text_config
            text_config.speculator = config.speculator
            text_config.quantize = config.quantize

            self.vision_model = load_vision_model(
                prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
                config=config.vision_config,
                weights=weights,
            )

            self.text_model = load_text_model(
                prefix="language_model" if not prefix else f"{prefix}.language_model",
                config=config.text_config,
                weights=weights,
            )
        else:
            config.text_config.quantize = config.quantize
            config.text_config.speculator = config.speculator
            self.text_model = load_text_model(
                prefix=prefix,
                config=config.text_config,
                weights=weights,
            )

        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )
        self.dtype = weights.dtype

    def get_attention_mask(
        self,
        input_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        dtype: torch.dtype,
        bool_mask: bool = False,
    ):
        image_token_mask = (input_ids == self.config.image_token_index).to(
            input_ids.device
        )

        device = input_ids.device
        min_dtype = torch.finfo(dtype).min

        lengths = (cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]).tolist()
        batch_size = len(lengths)

        sequence_length = max(lengths)
        target_length = sequence_length
        # Create the padding mask from the computed lengths.
        # pad_mask: [batch, sequence_length] where True indicates valid tokens.
        seq_range = torch.arange(sequence_length, device=device).unsqueeze(0)
        lengths_tensor = torch.tensor(lengths, device=device).unsqueeze(1)
        pad_mask = seq_range < lengths_tensor  # shape: [batch, sequence_length]

        # Build the base causal mask (for non-image tokens):
        causal_mask = torch.tril(
            torch.ones(
                (sequence_length, sequence_length), dtype=torch.bool, device=device
            )
        )
        base_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze(
            1
        )  # [batch, sequence_length, sequence_length]
        base_mask = base_mask & causal_mask.unsqueeze(0)  # apply causal constraint

        image_token_mask = torch.nn.utils.rnn.pad_sequence(
            torch.split(image_token_mask, lengths), batch_first=True, padding_value=0
        )
        bidirectional_mask = image_token_mask.unsqueeze(2) & image_token_mask.unsqueeze(
            1
        )

        # Combine the causal base mask and the bidirectional mask.
        combined_mask = torch.logical_or(
            base_mask.unsqueeze(1), bidirectional_mask.unsqueeze(1)
        ).to(device)
        # combined_mask now has shape [batch, 1, sequence_length, sequence_length]

        full_attention_mask = torch.zeros(
            (batch_size, 1, sequence_length, target_length),
            device=device,
            dtype=torch.bool,
        )
        full_attention_mask[:, :, :, :sequence_length] = combined_mask

        if bool_mask:
            return full_attention_mask
        else:
            return torch.where(full_attention_mask, 0, min_dtype).to(device)

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        pixel_values = pixel_values.to(dtype=self.dtype)
        image_outputs = self.vision_model(pixel_values)
        vision_outputs = self.post_vision_model_layernorm(
            image_outputs.last_hidden_state
        )
        image_features = self.multimodal_projector(vision_outputs)
        image_features = image_features.view(-1, image_features.shape[-1])
        return image_features

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            # Replace the image token embeddings with the vision features
            image_token_mask = (input_ids == self.config.image_token_index).to(
                input_ids.device
            )
            inputs_embeds[image_token_mask] = vision_embeds.view(
                -1, vision_embeds.shape[-1]
            )
        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
        pixel_values: torch.FloatTensor = None,
        # Unused here
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        if cu_seqlen_prefill is not None:
            max_s += 1
            position_ids += 1

        # Use flash attention for text-only input
        # else:
        #     if cu_seqlen_prefill is not None:
        #         min_dtype = torch.finfo(inputs_embeds.dtype).min
        #         lengths = (cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]).tolist()

        #         # Determine the maximum sequence length (after padding) from query.
        #         sequence_length = max(lengths)
        #         target_length = sequence_length

        #         # Create the padding mask from the computed lengths.
        #         # pad_mask: [batch, sequence_length] where True indicates valid tokens.
        #         seq_range = torch.arange(
        #             sequence_length, device=input_ids.device
        #         ).unsqueeze(0)
        #         lengths_tensor = torch.tensor(
        #             lengths, device=input_ids.device
        #         ).unsqueeze(1)
        #         pad_mask = seq_range < lengths_tensor  # shape: [batch, sequence_length]

        #         # Build the base causal mask (for non-image tokens):
        #         causal_mask = torch.tril(
        #             torch.ones(
        #                 (sequence_length, sequence_length),
        #                 dtype=torch.bool,
        #                 device=input_ids.device,
        #             )
        #         )
        #         base_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze(
        #             1
        #         )  # [batch, sequence_length, sequence_length]
        #         base_mask = base_mask & causal_mask.unsqueeze(0)
        #         attention_mask = base_mask.unsqueeze(
        #             1
        #         )  # [batch, 1, sequence_length, sequence_length]
        #         full_attention_mask = torch.zeros(
        #             (len(lengths), 1, sequence_length, target_length),
        #             device=input_ids.device,
        #             dtype=torch.bool,
        #         )
        #         full_attention_mask[:, :, :, :sequence_length] = attention_mask

        #         attention_mask = torch.where(full_attention_mask, 0, min_dtype).to(
        #             input_ids.device
        #         )

        if attention_mask is not None:
            min_dtype = torch.finfo(inputs_embeds.dtype).min
            # prefill may be larger than sliding window
            effective_seq_len = max(
                position_ids.shape[0], self.config.text_config.sliding_window
            )
            sliding_window_mask = torch.tril(
                torch.ones_like(attention_mask, dtype=torch.bool),
                diagonal=-self.config.text_config.sliding_window,
            )
            attention_mask_local = torch.where(
                sliding_window_mask, min_dtype, attention_mask
            )
            offset = max(0, position_ids.shape[0] - effective_seq_len)
            attention_mask_local = attention_mask_local[
                :, :, :, offset : offset + effective_seq_len
            ]
        else:
            attention_mask_local = None

        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
            attention_mask=attention_mask,
            attention_mask_local=attention_mask_local,
        )

        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)

        # pad logit with 1 zero logit for the image token
        if pixel_values is not None:
            logits = torch.cat(
                [logits, torch.zeros(logits.size(0), 1, device=logits.device)], dim=1
            )
            if speculative_logits is not None:
                speculative_logits = torch.cat(
                    [
                        speculative_logits,
                        torch.zeros(
                            speculative_logits.size(0),
                            1,
                            device=speculative_logits.device,
                        ),
                    ],
                    dim=1,
                )

        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)
from text_generation_server.utils.weights import UnquantizedWeight


class GemmaConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=256128,
        hidden_size=3072,
        intermediate_size=24576,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        head_dim=256,
        hidden_act="gelu_pytorch_tanh",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class GemmaFastRMSNorm(FastRMSNorm):
    @classmethod
    def load(cls, prefix: str, weights, eps=1e-6):
        dtype = weights.dtype
        weights.dtype = torch.float32
        weight = weights.get_tensor(f"{prefix}.weight") + 1
        weights.dtype = dtype
        new = cls(weight, eps)
        new.dtype = dtype
        return new

    # perform the multiplication in full precision and downcast after
    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states += residual
        residual = hidden_states
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states * self.weight
        return hidden_states.to(self.dtype), residual


def load_attention(config, prefix: str, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.head_dim
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias=None))


class FlashGemmaAttention(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_size = config.head_dim
        self.causal = causal

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=config.rope_theta,
            device=weights.device,
        )

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
                causal=self.causal,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class GemmaMLP(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])


class FlashGemmaLayer(nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool):
        super().__init__()
        self.self_attn = FlashGemmaAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights, causal=causal
        )
        self.mlp = GemmaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = GemmaFastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = GemmaFastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        mlp_output = self.mlp(normed_attn_res_output)

        return mlp_output, attn_res


class FlashGemmaModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, causal: bool):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.layers = nn.ModuleList(
            [
                FlashGemmaLayer(
                    prefix=f"{prefix}.layers.{layer_id}",
                    config=config,
                    weights=weights,
                    causal=causal,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = GemmaFastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = inputs_embeds

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashGemmaForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, *, causal: bool = True):
        super().__init__()

        embed_norm = config.hidden_size**0.5
        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.embed_tokens.weight *= embed_norm

        self.model = FlashGemmaModel(
            prefix=prefix, config=config, weights=weights, causal=causal
        )
        self.lm_head = SpeculativeHead.load(
            prefix=(
                f"{prefix}.embed_tokens"
                if config.tie_word_embeddings
                else f"{prefix}.lm_head"
            ),
            config=config,
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_embeds = self.embed_tokens(input_ids)
        hidden_states = self.model(
            input_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales


def load_qkv(config, prefix: str, weights, head_size, num_heads):
    if config.quantize == "gptq":
        return _load_qkv_gptq(
            config,
            prefix,
            weights,
        )
    elif config.quantize == "marlin":
        raise RuntimeError(
            "GPT-2 models with marlin quantization are not yet supported"
        )
    else:
        return _load_qkv(config, prefix, weights, head_size, num_heads)


def _load_qkv_gptq(config, prefix: str, weights):
    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    # Weights
    weight = weights.get_weights_col_packed_qkv(
        f"{prefix}.c_attn",
        config.num_attention_heads,
        config.num_attention_heads,
    )

    # Bias
    slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
    shape = slice_.get_shape()
    total_size = shape[0]
    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
    single_size = total_size // 3
    assert single_size % world_size == 0
    block_size = single_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size
    tensors = []
    for i in range(3):
        tensor = slice_[start + i * single_size : stop + i * single_size]
        tensors.append(tensor)
    bias = torch.cat(tensors, dim=0)
    bias = bias.to(device=weights.device)

    return TensorParallelColumnLinear(get_linear(weight, bias))


def _load_qkv(config, prefix: str, weights, head_size, num_heads):
    """Load QKV from a single, transposed matrix."""

    slice_ = weights._get_slice(f"{prefix}.c_attn.weight")
    shape = slice_.get_shape()
    total_size = shape[1]
    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
    world_size = weights.process_group.size()
    single_size = total_size // 3
    assert single_size % world_size == 0
    rank = weights.process_group.rank()

    # Weights
    block_size = single_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size
    tensors = []
    for i in range(3):
        tensor = slice_[:, start + i * single_size : stop + i * single_size]
        tensors.append(tensor)
    weight = torch.cat(tensors, dim=1).T
    weight = weight.to(dtype=weights.dtype)
    weight = weight.to(device=weights.device)

    # Bias
    slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
    shape = slice_.get_shape()
    total_size = shape[0]
    single_size = total_size // 3
    block_size = single_size // world_size
    assert single_size % world_size == 0
    start = rank * block_size
    stop = (rank + 1) * block_size
    b = []
    for i in range(3):
        tensor = slice_[start + i * single_size : stop + i * single_size]
        b.append(tensor)
    bias = torch.cat(b, dim=0)
    bias = bias.to(dtype=weights.dtype)
    bias = bias.to(device=weights.device)
    assert list(bias.shape) == [
        3 * num_heads * head_size
    ], f"{weight.shape} != {[3 * num_heads * head_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias))


def load_row(config, prefix: str, weights, bias: bool):
    """load_row, but with transposed weight matrices."""

    if config.quantize == "gptq":
        weight = weights.get_weights_row(prefix)
    else:
        weight = weights.get_sharded(f"{prefix}.weight", dim=0).T

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None

    return TensorParallelRowLinear(
        get_linear(weight, bias), process_group=weights.process_group
    )


def load_col(config, prefix: str, weights, bias: bool):
    """load_col, but with transposed weight matrices."""
    if config.quantize == "gptq":
        weight = weights.get_multi_weights_col([prefix], dim=1)
    else:
        weight = weights.get_sharded(f"{prefix}.weight", dim=1).T

    if bias:
        bias = weights.get_sharded(f"{prefix}.bias", dim=0)
    else:
        bias = None

    return TensorParallelColumnLinear(get_linear(weight, bias))


class FlashGPT2Attention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size

        self.head_size = self.hidden_size // self.num_heads
        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.query_key_value = load_qkv(
            config,
            prefix=prefix,
            weights=weights,
            head_size=self.head_size,
            num_heads=self.num_heads,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = load_row(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=True,
        )

        self.kv_head_mapping = torch.arange(
            0, self.num_heads, dtype=torch.int32, device=weights.device
        )

    def forward(
        self,
        hidden_states,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        query, key, value = self.query_key_value(hidden_states).split(
            self.head_size * self.num_heads, dim=1
        )
        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_heads, self.head_size)
        value = value.view(-1, self.num_heads, self.head_size)

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class GPT2MLP(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        act = config.activation_function
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        self.c_fc = load_col(
            config, prefix=f"{prefix}.c_fc", weights=weights, bias=True
        )
        self.c_proj = load_row(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=True,
        )

        intermediate_size = (
            config.n_inner if config.n_inner is not None else 4 * config.hidden_size
        )

        self.intermediate_size = intermediate_size // weights.process_group.size()

    def forward(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        return self.c_proj(hidden_states)


class FlashGPT2Layer(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.self_attn = FlashGPT2Attention(
            prefix=f"{prefix}.attn", config=config, weights=weights
        )
        self.mlp = GPT2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
        )
        self.post_attention_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.ln_2",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        attn_output = self.self_attn(
            hidden_states,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        hidden_states = attn_output + residual
        residual = hidden_states

        hidden_states = self.post_attention_layernorm(hidden_states)

        mlp_output = self.mlp(hidden_states)

        return residual + mlp_output, residual


class FlashGPT2Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.layers = nn.ModuleList(
            [
                FlashGPT2Layer(
                    prefix=(
                        f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}"
                    ),
                    config=config,
                    weights=weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )

        self.norm = nn.LayerNorm.load(
            prefix="ln_f" if not prefix else f"{prefix}.ln_f",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        true_max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
    ) -> torch.Tensor:
        hidden_states = inputs_embeds

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states = self.norm(hidden_states)

        return hidden_states


class FlashGPT2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=("wte" if not prefix else f"{prefix}.wte"),
            weights=weights,
        )
        self.embed_positions = TensorParallelEmbedding(
            prefix=("wpe" if not prefix else f"{prefix}.wpe"),
            weights=weights,
        )

        self.model = FlashGPT2Model(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="wte" if not prefix else f"{prefix}.wte",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        token_embeds = self.embed_tokens(input_ids)
        position_embeds = self.embed_positions(position_ids)
        inputs_embeds = token_embeds + position_embeds
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            true_max_s=max_s,
            prefill_cache_indices=prefill_cache_indices,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)


def load_attention(config, prefix: str, weights):
    return TensorParallelColumnLinear.load_multi(
        config,
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
        weights=weights,
        bias=False,
    )


def load_row(config, prefix: str, weights, bias: bool):
    weight = weights.get_weights_row(prefix)

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None

    linear = get_linear(weight, bias)
    return TensorParallelRowLinear(linear, process_group=weights.process_group)


class GPTJRotary(PositionRotaryEmbedding):
    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        # Such controlflows may add some overhead.
        if SYSTEM == "cuda":
            from text_generation_server.utils.kernels import load_kernel

            rotary = load_kernel(module="rotary", repo_id="kernels-community/rotary")

            q1 = query[..., ::2]
            q2 = query[..., 1::2]

            rotary.apply_rotary(q1, q2, cos, sin, q1, q2, False)

            k1 = key[..., ::2]
            k2 = key[..., 1::2]

            rotary.apply_rotary(k1, k2, cos, sin, k1, k2, False)
        elif SYSTEM == "rocm":
            import vllm._custom_ops as ops

            # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
            # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773

            head_size = query.shape[-1]

            # Inplace operation, updating query and key.
            ops.rotary_embedding(query, key, head_size, cos, sin, False)
        elif SYSTEM == "ipex":
            import intel_extension_for_pytorch as ipex

            ipex.llm.functional.rotary_embedding(
                query, key, sin, cos, query.size(-1), False
            )
        else:
            raise ValueError(
                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
            )


class FlashGPTJAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size

        self.head_size = self.hidden_size // self.num_heads
        self.softmax_scale = self.head_size**-0.5
        self.rotary_dim = config.rotary_dim

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.query_key_value = load_attention(
            config,
            prefix=prefix,
            weights=weights,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = load_row(
            config,
            prefix=f"{prefix}.out_proj",
            weights=weights,
            bias=False,
        )

        self.kv_head_mapping = torch.arange(
            0, self.num_heads, dtype=torch.int32, device=weights.device
        )

        self.rotary_emb = GPTJRotary.static(
            config=config,
            dim=self.rotary_dim,
            base=10000,
            device=weights.device,
        )

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        query, key, value = self.query_key_value(hidden_states).split(
            self.head_size * self.num_heads, dim=1
        )
        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_heads, self.head_size)
        value = value.view(-1, self.num_heads, self.head_size)

        # Compute rotary embeddings on rotary_ndims
        if self.rotary_dim is not None:
            self.rotary_emb(
                query[..., : self.rotary_dim], key[..., : self.rotary_dim], cos, sin
            )
        else:
            self.rotary_emb(query, key, cos, sin)

        kv_cache.store(
            key=key,
            value=value,
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=key,
                value=value,
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


class GPTJMLP(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        act = config.activation_function
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        self.fc_in = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.fc_in", weights=weights, bias=True
        )

        self.fc_out = load_row(
            config,
            prefix=f"{prefix}.fc_out",
            weights=weights,
            bias=True,
        )

    def forward(self, hidden_states):
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        return self.fc_out(hidden_states)


class FlashGPTJLayer(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.self_attn = FlashGPTJAttention(
            prefix=f"{prefix}.attn", config=config, weights=weights
        )
        self.mlp = GPTJMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        hidden_states, residual = self.input_layernorm(hidden_states, residual)
        # Self Attention
        attn_output = self.self_attn(
            hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        feed_forward_hidden_states = self.mlp(hidden_states)

        return attn_output + feed_forward_hidden_states, residual


class FlashGPTJModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.config = config

        self.wte = TensorParallelEmbedding(prefix=f"{prefix}.wte", weights=weights)
        self.layers = nn.ModuleList(
            [
                FlashGPTJLayer(
                    prefix=(
                        f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}"
                    ),
                    config=config,
                    weights=weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )

        self.ln_f = FastLayerNorm.load(
            prefix="ln_f" if not prefix else f"{prefix}.ln_f",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads

    def forward(
        self,
        input_ids: Optional[torch.LongTensor],
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
    ) -> torch.Tensor:
        hidden_states = self.wte(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.ln_f(hidden_states, residual)

        return hidden_states


class FlashGPTJForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"
        self.model = FlashGPTJModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            prefill_cache_indices=prefill_cache_indices,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from contextlib import contextmanager
from typing import List, Optional, Tuple, Type

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN

from text_generation_server.layers.attention import (
    KVCache,
    get_kv_scales,
)
from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
    FastLayerNorm,
)
from text_generation_server.layers import (
    FastLinear,
)
from text_generation_server.utils.weights import (
    Weights,
)
from text_generation_server.layers.fp8 import HybridFP8UnquantLoader

if SYSTEM != "ipex":
    pass

if SYSTEM == "rocm":
    try:
        import vllm._custom_ops as ops
    except Exception as e:
        raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}")


def load_attention(config, prefix: str, weights, layer_id):
    # Only defined in granite.
    bias = getattr(config, "attention_bias", False)
    head_size = config.hidden_size // config.num_attention_heads
    sizes = None
    prefixes = None

    if config.model_type == "phi3":
        base_layer = TensorParallelColumnLinear.load_qkv(
            config,
            prefix=f"{prefix}.qkv_proj",
            weights=weights,
            bias=bias,
            num_heads=config.num_attention_heads,
            num_key_value_heads=config.num_key_value_heads,
        )
        prefixes = ["qkv_proj"]
    elif config.model_type == "baichuan":
        prefix = f"{prefix}.W_pack"
        base_layer = TensorParallelColumnLinear.load_qkv(
            config,
            prefix=prefix,
            weights=weights,
            bias=bias,
            num_heads=config.num_attention_heads,
            num_key_value_heads=config.num_key_value_heads,
        )
        prefixes = [prefix]
    else:
        prefixes = ["q_proj", "k_proj", "v_proj"]
        sizes = [
            head_size * config.num_attention_heads,
            head_size * config.num_key_value_heads,
            head_size * config.num_key_value_heads,
        ]
        base_layer = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=bias,
        )

    return TensorParallelMultiAdapterLinear.load(
        base_layer=base_layer,
        layer_id=layer_id,
        layer_names=prefixes,
        sizes=sizes,
        process_group=weights.process_group,
    )


@contextmanager
def no_fp8(weights: Weights):
    """De-activate fp8 auto conversion for the duration of this context manager"""
    weights_loader = weights.weights_loader
    if isinstance(weights_loader, HybridFP8UnquantLoader) and weights_loader.to_fp8:
        weights_loader = HybridFP8UnquantLoader(
            weights_loader.activation_scale_ub, to_fp8=False
        )

    with weights.use_loader(weights_loader):
        yield


class FlashLlamaAttention(torch.nn.Module):
    def __init__(
        self,
        index: int,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads

        # Setting defaults for baichuan custom config which doesn't apply them.
        config.rope_theta = getattr(config, "rope_theta", 10000)
        config.num_key_value_heads = getattr(
            config, "num_key_value_heads", config.num_attention_heads
        )
        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=config.rope_theta,
            device=weights.device,
        )

        # `config.attention_multiplier` is used in Granite
        self.softmax_scale = getattr(
            config, "attention_multiplier", self.head_size**-0.5
        )

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        if config.num_key_value_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_key_value_heads` must be divisible by `num_shards` (got `num_key_value_heads`: {config.num_key_value_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights, index)
        self.index = index

        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=getattr(config, "attention_bias", False),
        )

        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            index,
            "o_proj",
            process_group=weights.process_group,
        )

        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache: KVCache,
        block_tables,
        slots,
        seqlen,
        max_s,
        adapter_data,
    ):
        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_scales=self.kv_scales,
                kv_cache=kv_cache,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Phi3MoE(nn.Module):
    def __init__(
        self, prefix: str, config, moe_layer_cls: Type[MoELayer], weights: Weights
    ):
        super().__init__()

        # gating
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)

        self.moe = moe_layer_cls(
            prefix=f"{prefix}.experts",
            n_experts=config.num_local_experts,
            n_expert_group=None,
            renormalize=True,
            topk=config.num_experts_per_tok,
            topk_group=None,
            weights=weights,
            gate_proj_name="w1",
            up_proj_name="w3",
            down_proj_name="w2",
        )

        self.process_group = weights.process_group

    def forward(self, x, adapter_data) -> torch.Tensor:
        # router_logits: (num_tokens, n_experts)
        router_logits = self.gate(x)
        out = self.moe(x, gating_output=router_logits)

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class LlamaMLP(nn.Module):
    def __init__(self, prefix, config, weights, index):
        super().__init__()
        self.hidden_act = config.hidden_act
        self.act = (
            ACT2FN[self.hidden_act]
            if "gelu" not in self.hidden_act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh"
                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
                    else "none"
                ),
            )
        )
        prefixes = None
        sizes = None

        # Fuse gate and up proj
        bias = getattr(config, "mlp_bias", False)
        if config.model_type == "phi3":
            gate_up_proj = TensorParallelColumnLinear.load_gate_up(
                config,
                prefix=f"{prefix}.gate_up_proj",
                weights=weights,
                bias=bias,
            )
        else:
            prefixes = ["gate_proj", "up_proj"]
            sizes = [
                config.intermediate_size,
                config.intermediate_size,
            ]
            gate_up_proj = TensorParallelColumnLinear.load_multi(
                config,
                prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
                weights=weights,
                dim=0,
                bias=bias,
            )

        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            index,
            layer_names=prefixes,
            sizes=sizes,
            process_group=weights.process_group,
        )

        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=bias,
        )

        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            index,
            "down_proj",
            process_group=weights.process_group,
        )

        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize

        self.hidden_size = config.hidden_size

    def forward(self, hidden_states, adapter_data):
        if (
            SYSTEM == "rocm"
            and self.hidden_act == "silu"
            and hidden_states.dtype == torch.float16
            and hidden_states.shape[0] == 1
            and not self.quantize
            and self.hidden_size
            != 16384  # TODO: Temporary workaround for `LLMM_Silu` kernel not working with LLama3.1 405B; needs refactoring once fixed.
        ):
            out = torch.empty(
                hidden_states.shape[0],
                self.intermediate_size,
                dtype=hidden_states.dtype,
                device="cuda",
            )
            ops.LLMM_Silu(
                self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8
            )
            return self.down_proj(out, adapter_data)
        else:
            gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
            gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
            return self.down_proj(
                self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
            )


class FlashLlamaLayer(nn.Module):
    def __init__(self, index, prefix, config, weights):
        super().__init__()

        with no_fp8(weights):
            self.self_attn = FlashLlamaAttention(
                index=index,
                prefix=f"{prefix}.self_attn",
                config=config,
                weights=weights,
            )

        if config.model_type == "phimoe":
            moe_layer_cls = (
                SparseMoELayer
                if SparseMoELayer.is_supported(weights)
                else DenseMoELayer
            )
            self.mlp = Phi3MoE(
                f"{prefix}.block_sparse_moe", config, moe_layer_cls, weights
            )
            # with moe the layernorms are are not rmsnorms and they have bias
            self.input_layernorm = FastLayerNorm.load(
                prefix=f"{prefix}.input_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
            self.post_attention_layernorm = FastLayerNorm.load(
                prefix=f"{prefix}.post_attention_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
        else:
            self.mlp = LlamaMLP(
                prefix=f"{prefix}.mlp", config=config, weights=weights, index=index
            )
            self.input_layernorm = FastRMSNorm.load(
                prefix=f"{prefix}.input_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
            self.post_attention_layernorm = FastRMSNorm.load(
                prefix=f"{prefix}.post_attention_layernorm",
                weights=weights,
                eps=config.rms_norm_eps,
            )

        # Used in Granite
        # This could eventually be baked into the weights like we do for the embeddings/lm_head
        # but this would mean modifying the lora code
        self.residual_multiplier = getattr(config, "residual_multiplier", None)

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        adapter_data,
        cross_attention_states,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            adapter_data,
        )
        if self.residual_multiplier is not None:
            attn_output *= self.residual_multiplier

        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        mlp_output = self.mlp(normed_attn_res_output, adapter_data)
        if self.residual_multiplier is not None:
            mlp_output *= self.residual_multiplier

        return mlp_output, attn_res


class FlashLlamaModel(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()

        # Skip fp8 quant for first and last layers
        self.layers = nn.ModuleList()
        self.cross_attention_layers = getattr(config, "cross_attention_layers", [])
        with no_fp8(weights):
            self.layers.append(
                FlashLlamaLayer(
                    index=0,
                    prefix=f"{prefix}.layers.0",
                    config=config,
                    weights=weights,
                )
            )

        # Skip first and last layers
        for layer_id in range(1, config.num_hidden_layers - 1):
            if layer_id in self.cross_attention_layers:
                from text_generation_server.models.custom_modeling.mllama import (
                    FlashLlamaCrossLayer,
                )

                self.layers.append(
                    FlashLlamaCrossLayer(
                        index=layer_id,
                        prefix=(f"{prefix}.layers.{layer_id}"),
                        config=config,
                        weights=weights,
                    )
                )
            else:
                self.layers.append(
                    FlashLlamaLayer(
                        index=layer_id,
                        prefix=(f"{prefix}.layers.{layer_id}"),
                        config=config,
                        weights=weights,
                    )
                )

        with no_fp8(weights):
            last_layer_id = config.num_hidden_layers - 1
            self.layers.append(
                FlashLlamaLayer(
                    index=last_layer_id,
                    prefix=(f"{prefix}.layers.{last_layer_id}"),
                    config=config,
                    weights=weights,
                )
            )

        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        true_max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        adapter_data,
        cross_attention_states=None,
    ) -> torch.Tensor:
        hidden_states = inputs_embeds

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
                adapter_data,
                cross_attention_states,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashLlamaForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, name=None):
        if name is None:
            name = "model"
        super().__init__()
        with no_fp8(weights):
            self.embed_tokens = TensorParallelEmbedding(
                prefix=(
                    f"{name}.embed_tokens"
                    if not prefix
                    else f"{prefix}.{name}.embed_tokens"
                ),
                weights=weights,
            )
        self.model = FlashLlamaModel(
            prefix=name if not prefix else f"{prefix}.{name}",
            config=config,
            weights=weights,
        )
        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        # Used in Granite
        embedding_multiplier = getattr(config, "embedding_multiplier", None)
        if embedding_multiplier is not None:
            self.embed_tokens.weight.data *= embedding_multiplier
        prefix = suffix if not prefix or name != "model" else f"{prefix}.{suffix}"
        with no_fp8(weights):
            self.lm_head = SpeculativeHead.load(
                config,
                prefix,
                weights,
            )

        # Used in Granite
        self.logits_scaling = getattr(config, "logits_scaling", None)
        if self.logits_scaling is not None and self.lm_head.head is not None:
            try:
                # Scale the weights directly
                self.lm_head.head.linear.weight.data /= self.logits_scaling
                self.logits_scaled = True
            except Exception:
                self.logits_scaled = False

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
        cross_attention_states=None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            true_max_s=max_s,
            prefill_cache_indices=prefill_cache_indices,
            adapter_data=adapter_data,
            cross_attention_states=cross_attention_states,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)

        # Used in Granite
        if self.logits_scaling is not None and not self.logits_scaled:
            logits /= self.logits_scaling
            if speculative_logits is not None:
                speculative_logits /= self.logits_scaling

        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple

from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
)
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)


if SYSTEM == "rocm":
    try:
        import vllm._custom_ops as ops
    except Exception as e:
        raise ImportError(f"Could not load `vllm._custom_ops`. Full error: {e}")


class MistralConfig(PretrainedConfig):
    model_type = "mistral"

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        sliding_window=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class MistralAttention(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, layer_id):
        super().__init__()
        self.max_past = (
            config.sliding_window if config.sliding_window is not None else -1
        )
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        if getattr(config, "head_dim", None) is not None:
            self.head_size = config.head_dim
        else:
            self.head_size = self.hidden_size // self.num_heads
        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=config.rope_theta,
            device=weights.device,
        )

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        query_key_value = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )

        self.query_key_value = TensorParallelMultiAdapterLinear.load(
            query_key_value,
            layer_id,
            ["q_proj", "k_proj", "v_proj"],
            sizes=[
                self.head_size * config.num_attention_heads,
                self.head_size * config.num_key_value_heads,
                self.head_size * config.num_key_value_heads,
            ],
            process_group=weights.process_group,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            layer_id,
            "o_proj",
            process_group=weights.process_group,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        prefill_cache_indices,
        adapter_data,
    ):
        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        if prefill_cache_indices is not None:
            kv_to_cache = kv[prefill_cache_indices]
        else:
            kv_to_cache = kv

        kv_cache.store(
            key=kv_to_cache[:, 0],
            value=kv_to_cache[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv_to_cache[:, 0],
                value=kv_to_cache[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
                window_size_left=self.max_past,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
                window_size_left=self.max_past,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class MistralMLP(nn.Module):
    def __init__(self, prefix: str, config, weights, layer_id):
        super().__init__()
        self.hidden_act = config.hidden_act
        self.act = (
            ACT2FN[self.hidden_act]
            if "gelu" not in self.hidden_act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh"
                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
                    else "none"
                ),
            )
        )
        # Fuse gate and up proj
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            layer_id,
            ["gate_proj", "up_proj"],
            sizes=[
                config.intermediate_size,
                config.intermediate_size,
            ],
            process_group=weights.process_group,
        )

        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )

        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            layer_id,
            "down_proj",
            process_group=weights.process_group,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize

    def forward(self, hidden_states, adapter_data):
        if (
            SYSTEM == "rocm"
            and self.hidden_act == "silu"
            and hidden_states.dtype == torch.float16
            and hidden_states.shape[0] == 1
            and not self.quantize
        ):
            out = torch.empty(
                hidden_states.shape[0],
                self.intermediate_size,
                dtype=hidden_states.dtype,
                device="cuda",
            )
            ops.LLMM_Silu(
                self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8
            )
            return self.down_proj(out, adapter_data)
        else:
            gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
            gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
            return self.down_proj(
                self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
            )


class MistralLayer(nn.Module):
    def __init__(self, prefix: str, config, weights, layer_id):
        super().__init__()
        self.self_attn = MistralAttention(
            prefix=f"{prefix}.self_attn",
            config=config,
            weights=weights,
            layer_id=layer_id,
        )
        self.mlp = MistralMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id
        )

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        prefill_cache_indices,
        adapter_data,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            prefill_cache_indices,
            adapter_data,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        mlp_output = self.mlp(normed_attn_res_output, adapter_data)

        return mlp_output, attn_res


class MistralModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.layers = nn.ModuleList(
            [
                MistralLayer(
                    prefix=f"{prefix}.layers.{layer_id}",
                    config=config,
                    weights=weights,
                    layer_id=layer_id,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        true_max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        adapter_data: Optional[torch.Tensor] = None,
    ):
        hidden_states = inputs_embeds
        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, true_max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
                prefill_cache_indices,
                adapter_data,
            )

        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states


class FlashMistralForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights, name=None):
        if name is None:
            name = "model"
        super().__init__()
        self.embed_tokens = TensorParallelEmbedding(
            prefix=(
                f"{name}.embed_tokens"
                if not prefix
                else f"{prefix}.{name}.embed_tokens"
            ),
            weights=weights,
        )
        self.model = MistralModel(
            prefix=name if not prefix else f"{prefix}.{name}",
            config=config,
            weights=weights,
        )
        self.lm_head = SpeculativeHead.load(
            config,
            # TODO dirty hack for idefics2.
            prefix=(
                "lm_head" if not prefix or name != "model" else f"{prefix}.lm_head"
            ),
            weights=weights,
        )
        self.max_past = config.sliding_window
        self.max_past_tensor = (
            torch.tensor(config.sliding_window, device=weights.device)
            if self.max_past is not None
            else None
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        true_max_s = max_s
        if prefill_cache_indices is not None:
            # Slots also need to be sliced as it has the same size as the whole kv tensor
            slots = slots[prefill_cache_indices]
        elif self.max_past is not None:
            # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
            # kernel requires the true values
            seqlen = seqlen.clamp(max=self.max_past_tensor)

        inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            true_max_s,
            prefill_cache_indices,
            adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Tuple, Type

import torch
import torch.distributed
from torch import nn
from transformers.configuration_utils import PretrainedConfig

from text_generation_server.layers import (
    FastLinear,
    SpeculativeHead,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
)
from text_generation_server.layers.attention import (
    Seqlen,
    attention,
    paged_attention,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import FastRMSNorm
from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.utils.weights import UnquantizedWeight


class MixtralConfig(PretrainedConfig):
    model_type = "mixtral"

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        sliding_window=None,
        num_experts_per_tok=2,
        num_local_experts=8,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


def promote_scalar(x: torch.Tensor) -> torch.Tensor:
    return x.view(1) if len(x.size()) == 0 else x


def load_attention(config, prefix: str, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    return TensorParallelColumnLinear(get_linear(weight, bias=None))


def _load_experts(config, prefix: str, mat, weights):
    if config.quantize is not None:
        raise NotImplementedError("Mixtral does not support weight quantization yet.")

    assert mat in ["w1", "w2", "w3"]

    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    assert (
        config.intermediate_size % world_size == 0
    ), f"The chosen size {config.intermediate_size} is not compatible with sharding on {world_size} shards"

    block_size = config.intermediate_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size

    tensor = torch.empty(
        (config.num_local_experts * block_size, config.hidden_size),
        dtype=weights.dtype,
        device=weights.device,
    )

    for i in range(config.num_local_experts):
        slice_ = weights._get_slice(f"{prefix}.{i}.{mat}.weight")

        if mat == "w2":
            expert_slice = slice_[:, start:stop].t().contiguous()
        else:
            expert_slice = slice_[start:stop]
        tensor[i * block_size : (i + 1) * block_size] = expert_slice.to(
            dtype=weights.dtype
        ).to(device=weights.device)
    return tensor


class MixtralAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.max_past = (
            config.sliding_window if config.sliding_window is not None else -1
        )
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=config.rope_theta,
            device=weights.device,
        )

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        prefill_cache_indices,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        if prefill_cache_indices is not None:
            kv_to_cache = kv[prefill_cache_indices]
        else:
            kv_to_cache = kv

        kv_cache.store(
            key=kv_to_cache[:, 0],
            value=kv_to_cache[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv_to_cache[:, 0],
                value=kv_to_cache[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
                window_size_left=self.max_past,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
                window_size_left=self.max_past,
            )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))


@torch.jit.script
def select_experts(gate_logits: torch.Tensor, top_k: int):
    # all_probs: (sequence_length, n_experts) and upcast for softmax
    all_probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)
    # weights, selected_experts: (sequence_length, top-k)
    weights, selected_experts = torch.topk(all_probs, top_k, dim=-1)
    weights /= weights.sum(dim=-1, keepdim=True)
    weights = weights.view(-1)
    selected_experts = selected_experts.view(-1)

    return selected_experts, weights


@torch.jit.script
def round_up(x: torch.Tensor, value: int):
    return torch.div(x + (value - 1), value, rounding_mode="trunc") * value


class MixtralMoE(nn.Module):
    def __init__(
        self, prefix, config: MixtralConfig, moe_layer_cls: Type[MoELayer], weights
    ):
        super().__init__()

        # gating
        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)

        self.moe = moe_layer_cls(
            n_expert_group=None,
            n_experts=config.num_local_experts,
            prefix=f"{prefix}.experts",
            renormalize=True,
            topk=config.num_experts_per_tok,
            topk_group=None,
            weights=weights,
            gate_proj_name="w1",
            up_proj_name="w3",
            down_proj_name="w2",
        )
        assert isinstance(self.moe, MoELayer)

        self.process_group = weights.process_group

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # router_logits: (num_tokens, n_experts)
        router_logits = self.gate(x)
        out = self.moe(x, gating_output=router_logits)

        # Reduce sum
        if self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)

        return out.view(*x.shape)


class MixtralLayer(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"

        self.self_attn = MixtralAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )

        moe_layer_cls = (
            SparseMoELayer if SparseMoELayer.is_supported(weights) else DenseMoELayer
        )
        self.moe = MixtralMoE(
            f"{prefix}.block_sparse_moe", config, moe_layer_cls, weights
        )

        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        prefill_cache_indices,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            prefill_cache_indices,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        moe_output = self.moe(normed_attn_res_output)

        return moe_output, attn_res


class MixtralModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.embed_tokens = TensorParallelEmbedding(
            prefix=(
                "model.embed_tokens" if not prefix else f"{prefix}.model.embed_tokens"
            ),
            weights=weights,
        )

        self.layers = nn.ModuleList(
            [
                MixtralLayer(
                    "model" if not prefix else f"{prefix}.model",
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix="model.norm" if not prefix else f"{prefix}.model.norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        true_max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, true_max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
                prefill_cache_indices,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashMixtralForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.model = MixtralModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head" if not prefix else f"{prefix}.lm_head",
            weights=weights,
        )
        self.max_past = config.sliding_window
        self.max_past_tensor = (
            torch.tensor(config.sliding_window, device=weights.device)
            if self.max_past is not None
            else None
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        true_max_s = max_s
        if prefill_cache_indices is not None:
            # Slots also need to be sliced as it has the same size as the whole kv tensor
            slots = slots[prefill_cache_indices]
        elif self.max_past is not None:
            # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
            # kernel requires the true values
            seqlen = seqlen.clamp(max=self.max_past_tensor)

        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            true_max_s,
            prefill_cache_indices,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.models.gpt_neox import GPTNeoXConfig as TransformersGPTNeoXConfig
from typing import Optional, List, Tuple
from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.utils.weights import UnquantizedWeight


class GPTNeoXConfig(TransformersGPTNeoXConfig):
    attribute_map = {
        "num_key_value_heads": "num_attention_heads",
    }


def load_row(config, prefix: str, weights, bias: bool):
    weight = weights.get_weights_row(prefix)

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None

    linear = get_linear(weight, bias)
    if config.use_parallel_residual:
        return linear
    else:
        return TensorParallelRowLinear(linear, process_group=weights.process_group)


def load_qkv(config, prefix: str, weights, num_heads, head_size, hidden_size):
    weight = weights.get_multi_weights_col([prefix], dim=0)
    if isinstance(weight, UnquantizedWeight):
        # Only on non quantized versions
        weight.weight = (
            weight.weight.view(
                num_heads,
                3,
                head_size,
                hidden_size,
            )
            .permute(1, 0, 2, 3)
            .reshape(-1, hidden_size)
        )

    bias = weights.get_sharded(f"{prefix}.bias", dim=0)
    bias = bias.view(num_heads, 3, head_size).permute(1, 0, 2).reshape(-1)

    linear = get_linear(weight, bias)
    if config.use_parallel_residual:
        return linear
    else:
        return TensorParallelColumnLinear(linear)


class FlashNeoxAttention(torch.nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        num_heads = config.num_attention_heads
        hidden_size = config.hidden_size

        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.head_size = hidden_size // num_heads

        self.rotary_dim = int(config.rotary_pct * self.head_size)

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.rotary_dim,
            base=config.rotary_emb_base,
            device=weights.device,
        )

        self.softmax_scale = self.head_size ** (-0.5)

        self.query_key_value = load_qkv(
            config,
            prefix=f"{prefix}.query_key_value",
            weights=weights,
            num_heads=self.num_heads,
            head_size=self.head_size,
            hidden_size=self.hidden_size,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")
        self.dense = load_row(
            config, prefix=f"{prefix}.dense", weights=weights, bias=True
        )
        self.kv_head_mapping = torch.arange(
            0, self.num_heads, dtype=torch.int32, device=weights.device
        )

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        qkv = self.query_key_value(hidden_states)
        qkv = qkv.view(-1, 3, self.num_heads, self.head_size)

        # Compute rotary embeddings on rotary_ndims
        query_rot = qkv[:, 0][..., : self.rotary_dim]
        query_pass = qkv[:, 0][..., self.rotary_dim :]
        key_rot = qkv[:, 1][..., : self.rotary_dim]
        key_pass = qkv[:, 1][..., self.rotary_dim :]

        # Inplace rotary
        self.rotary_emb(query_rot, key_rot, cos, sin)
        qkv[:, 0] = torch.cat((query_rot, query_pass), dim=-1)
        qkv[:, 1] = torch.cat((key_rot, key_pass), dim=-1)

        kv_cache.store(
            key=qkv[:, 1],
            value=qkv[:, 2],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=qkv[:, 0],
                key=qkv[:, 1],
                value=qkv[:, 2],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                qkv[:, 0],
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.dense(attn_output.view(-1, self.num_heads * self.head_size))


class FlashMLP(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        self.dense_h_to_4h = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=True
        )
        self.dense_4h_to_h = load_row(
            config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=True
        )

    def forward(self, hidden_states):
        hidden_states = self.dense_h_to_4h(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dense_4h_to_h(hidden_states)
        return hidden_states


class FlashNeoXLayer(nn.Module):
    def __init__(self, layer_id, config, weights):
        super().__init__()

        layer_norm_eps = config.layer_norm_eps

        prefix = f"gpt_neox.layers.{layer_id}"

        self.use_parallel_residual = config.use_parallel_residual
        self.input_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=layer_norm_eps
        )
        self.post_attention_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=layer_norm_eps,
        )
        self.attention = FlashNeoxAttention(
            config, prefix=f"{prefix}.attention", weights=weights
        )

        self.mlp = FlashMLP(config, prefix=f"{prefix}.mlp", weights=weights)
        self.process_group = weights.process_group

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        if self.use_parallel_residual:
            ln1_hidden_states, _ = self.input_layernorm(hidden_states)

            attn_output = self.attention(
                ln1_hidden_states,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache,
                block_tables,
                slots,
                seqlen,
                max_s,
            )

            ln2_hidden_states, _ = self.post_attention_layernorm(hidden_states)

            mlp_output = self.mlp(ln2_hidden_states)
            intermediate = mlp_output + attn_output

            if self.process_group.size() > 1:
                torch.distributed.all_reduce(intermediate, group=self.process_group)

            return intermediate + hidden_states, None
        else:
            hidden_states, residual = self.input_layernorm(hidden_states, residual)

            hidden_states = self.attention(
                hidden_states,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache,
                block_tables,
                slots,
                seqlen,
                max_s,
            )

            hidden_states, residual = self.post_attention_layernorm(
                hidden_states, residual
            )

            mlp_output = self.mlp(hidden_states)

            return mlp_output, residual


class FlashGPTNeoXPreTrainedModel(PreTrainedModel):
    config_class = GPTNeoXConfig
    base_model_prefix = "gpt_neox"
    supports_gradient_checkpointing = False
    _no_split_modules = None


class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)
        self.config = config

        self.embed_in = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_in", weights=weights
        )

        self.layers = nn.ModuleList(
            [
                FlashNeoXLayer(layer_id, config, weights)
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.final_layer_norm = FastLayerNorm.load(
            prefix=f"{prefix}.final_layer_norm",
            weights=weights,
            eps=config.layer_norm_eps,
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].attention.head_size
        self.num_heads = self.layers[0].attention.num_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_in(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].attention.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.final_layer_norm(hidden_states, residual)

        return hidden_states


class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel):
    def __init__(self, prefix, config, weights):
        super().__init__(config)

        if not prefix:
            prefix = "gpt_neox"
        else:
            prefix = f"{prefix}.gpt_neox"

        self.gpt_neox = FlashGPTNeoXModel(prefix, config, weights)

        self.embed_out = SpeculativeHead.load(
            config, prefix="embed_out", weights=weights
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.gpt_neox(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.embed_out(hidden_states)
        return logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
================================================
# coding=utf-8
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed
from torch import nn
from typing import Optional, List, Tuple

from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear
from text_generation_server.layers.attention import Seqlen
from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
    load_vision_model,
)


class PaliGemmaForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = config.quantize
        self.vision_tower = load_vision_model(
            prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
            config=config.vision_config,
            weights=weights,
        )
        self.post_vision_tower_layernorm = nn.LayerNorm.load(
            prefix="vision_tower.vision_model.post_layernorm",
            weights=weights,
            eps=config.vision_config.layer_norm_eps,
        )

        self.multi_modal_projector = TensorParallelColumnLinear.load(
            config,
            prefix="multi_modal_projector.linear",
            weights=weights,
            bias=True,
        )

        self.vocab_size = config.text_config.vocab_size
        self.config = config

        text_config = config.text_config
        text_config.speculator = config.speculator
        text_config.quantize = config.quantize
        self.text_model = load_text_model(
            prefix="language_model" if not prefix else f"{prefix}.language_model",
            config=config.text_config,
            weights=weights,
        )
        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )
        self.dtype = weights.dtype

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        pixel_values = pixel_values.to(dtype=self.dtype)
        image_outputs = self.vision_tower(pixel_values)
        last_hidden_state = self.post_vision_tower_layernorm(
            image_outputs.last_hidden_state
        )
        image_features = self.multi_modal_projector(last_hidden_state)
        image_features = image_features.view(-1, image_features.shape[-1])
        return image_features

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            mask = input_ids == self.config.image_token_index
            inputs_embeds[mask] = vision_embeds

        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
        # Unused here
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # TODO This is odd but apparently pali gemma position ids start at 1.
        if cu_seqlen_prefill is not None:
            max_s += 1
            position_ids += 1

        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
        )

        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)

        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
================================================
import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)


class PhiConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=51200,
        hidden_size=2560,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="gelu_fast",  # llama uses silu
        layer_norm_eps=1e-05,  # rms in llama,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        resid_pdrop=0.1,  # llama doesn't have this
        partial_rotary_factor=0.5,  # important difference between llama and phi
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.rope_theta = rope_theta
        self.resid_pdrop = resid_pdrop
        self.partial_rotary_factor = partial_rotary_factor

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


# this is the same as llama except for Phi uses bias=True
def load_attention(config, prefix, weights):
    if config.num_attention_heads != config.num_key_value_heads:
        return _load_gqa(config, prefix, weights)
    else:
        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if config.quantize not in ["gptq", "awq", "marlin"]:
        weight = weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    # this is the same as llama except for Phi uses bias=True
    return TensorParallelColumnLinear(get_linear(weight, bias=True))


class FlashPhiAttention(torch.nn.Module):
    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads

        self.softmax_scale = self.head_size**-0.5
        self.rotary_dim = int(config.partial_rotary_factor * self.head_size)

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.rotary_dim,
            base=config.rope_theta,
            device=weights.device,
        )

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )

        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        # in llama the dense layer is called "o_proj" and has bias=False
        self.dense = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.dense",
            weights=weights,
            bias=True,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        # Compute query, key, value and split
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )

        # Reshape query and key for rotary embeddings
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        # NOTE: this is the main difference between Llama and Phi
        # in llama the rotary embeddings are applied to the whole query and key.
        # Phi uses PARTIAL rotary embeddings, which are applied to the first 32 dimensions
        #
        # Apply partial positional embeddings in place
        self.rotary_emb(
            query[:, :, : self.rotary_dim], kv[:, 0, :, : self.rotary_dim], cos, sin
        )

        # Reshape key and value and cache
        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_scales=self.kv_scales,
                kv_cache=kv_cache,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.dense(attn_output.view(-1, self.num_heads * self.head_size))


class PhiMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        # llama weights are up_proj and down_proj and bias=False
        self.up_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.fc1",
            weights=weights,
            bias=True,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.fc2",
            weights=weights,
            bias=True,
        )

    def forward(self, hidden_states):
        # NOTE: Llama requires the gate up states to an intermediate size
        # Phi does not and we can avoid the `view` operation
        return self.down_proj(self.act(self.up_proj(hidden_states)))


class FlashPhiLayer(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"
        self.self_attn = FlashPhiAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.mlp = PhiMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.input_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.input_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )
        self.resid_dropout = torch.nn.Dropout(config.resid_pdrop)

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        hidden_states, res = self.input_layernorm(hidden_states, residual)
        # Self Attention
        attn_output = self.self_attn(
            hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        hidden_states = self.resid_dropout(attn_output).add(
            self.resid_dropout(self.mlp(hidden_states))
        )

        return hidden_states, res


class FlashPhiModel(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.layers = nn.ModuleList(
            [
                FlashPhiLayer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

        self.norm = FastLayerNorm.load(
            prefix="model.final_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashPhiForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.model = FlashPhiModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]

        return self.lm_head(hidden_states)


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_phi_moe_modeling.py
================================================
# coding=utf-8
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""PyTorch Phi-MoE model."""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


PHIMOE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/Phi-3.5-MoE-instruct": "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/resolve/main/config.json",
}


class PhiMoEConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PhiMoEModel`]. It is used to instantiate a Phi-MoE
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the
    [microsoft/Phi-3.5-MoE-instruct](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32064):
            Vocabulary size of the PhiMoE model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`PhiMoEModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 6400):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. Mixtral's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
            contain the following keys: `type`, `short_factor`, `long_factor`, `short_mscale`, `long_mscale` and
            `original_max_position_embeddings`. The `type` must be `longrope`, the `short_mscale` and `long_scale` must
            be numbers, the `short_factor` and `long_factor` must be lists of numbers with the same length as half of
            the attention head size and the `original_max_position_embeddings` must be an integer.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `262144`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to root per-token, can be also interpreted as the `top-p` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 16):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabeling this will also
            allow the model to output the auxiliary loss. See [here]() for more details
        router_aux_loss_coef (`float`, *optional*, defaults to 0.0):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.01):
            Amount of noise to add to the router.

    ```python
    >>> from transformers import PhiMoEModel, PhiMoEConfig

    >>> # Initializing a Phi-3 style configuration
    >>> configuration = PhiMoEConfig.from_pretrained("microsoft/Phi-3.5-MoE-instruct")

    >>> # Initializing a model from the configuration
    >>> model = PhiMoEModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "phimoe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32064,
        hidden_size=4096,
        intermediate_size=6400,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        rope_scaling=None,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=2,
        num_local_experts=16,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        router_jitter_noise=0.01,
        input_jitter_noise=0.0,
        attention_bias=False,
        lm_head_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        self.attention_bias = attention_bias
        self.lm_head_bias = lm_head_bias
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.router_jitter_noise = router_jitter_noise
        self.input_jitter_noise = input_jitter_noise

        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 6:
            raise ValueError(
                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor`, `long_factor`, "
                f"`short_mscale`, `long_mscale` and `original_max_position_embeddings`, got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
        rope_scaling_short_mscale = self.rope_scaling.get("short_mscale", None)
        rope_scaling_long_mscale = self.rope_scaling.get("long_mscale", None)
        original_max_position_embeddings = self.rope_scaling.get(
            "original_max_position_embeddings", None
        )
        if rope_scaling_type is None or rope_scaling_type not in ["longrope"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}"
            )
        if not (
            isinstance(rope_scaling_short_factor, list)
            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
        ):
            raise ValueError(
                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
            )
        if (
            not len(rope_scaling_short_factor)
            == self.hidden_size // self.num_attention_heads // 2
        ):
            raise ValueError(
                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
            )
        if not (
            isinstance(rope_scaling_long_factor, list)
            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
        ):
            raise ValueError(
                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
            )
        if (
            not len(rope_scaling_long_factor)
            == self.hidden_size // self.num_attention_heads // 2
        ):
            raise ValueError(
                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
            )
        if not isinstance(rope_scaling_short_mscale, (int, float)):
            raise ValueError(
                f"`rope_scaling`'s short_mscale field must be a number, got {rope_scaling_short_mscale}"
            )
        if not isinstance(rope_scaling_long_mscale, (int, float)):
            raise ValueError(
                f"`rope_scaling`'s long_mscale field must be a number, got {rope_scaling_long_mscale}"
            )
        if not isinstance(original_max_position_embeddings, int):
            raise ValueError(
                f"`rope_scaling`'s original_max_position_embeddings field must be an integer, got {original_max_position_embeddings}"
            )


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
================================================
import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.layernorm import (
    FastRMSNorm,
)


def load_attention(config, prefix, weights, layer_id):
    prefixes = [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
    head_size = config.hidden_size // config.num_attention_heads
    sizes = [
        head_size * config.num_attention_heads,
        head_size * config.num_key_value_heads,
        head_size * config.num_key_value_heads,
    ]
    if config.num_attention_heads != config.num_key_value_heads:
        base_layer = _load_gqa(config, prefix, weights)
    else:
        base_layer = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=prefixes,
            dim=0,
            weights=weights,
            bias=True,
        )
    return TensorParallelMultiAdapterLinear.load(
        base_layer=base_layer,
        layer_id=layer_id,
        layer_names=prefixes,
        sizes=sizes,
        process_group=weights.process_group,
    )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    return TensorParallelColumnLinear.load_multi(
        config,
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
        weights=weights,
        bias=True,
    )


class Qwen2Attention(torch.nn.Module):
    def __init__(
        self,
        index: int,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.window_size = (
            config.sliding_window if config.sliding_window is not None else -1
        )
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=config.rope_theta,
            device=weights.device,
        )

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights, index)

        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            index,
            "o_proj",
            process_group=weights.process_group,
        )
        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        prefill_cache_indices,
        adapter_data,
    ):
        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        if prefill_cache_indices is not None:
            kv_to_cache = kv[prefill_cache_indices]
        else:
            kv_to_cache = kv

        kv_cache.store(
            key=kv_to_cache[:, 0],
            value=kv_to_cache[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv_to_cache[:, 0],
                value=kv_to_cache[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
                window_size_left=self.window_size,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
                window_size_left=self.window_size,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Qwen2MLP(nn.Module):
    def __init__(self, prefix, config, weights, index):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        prefixes = [f"{prefix}.gate_proj", f"{prefix}.up_proj"]
        sizes = [
            config.intermediate_size,
            config.intermediate_size,
        ]
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=prefixes,
            weights=weights,
            dim=0,
            bias=False,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            layer_id=index,
            layer_names=prefixes,
            sizes=sizes,
            process_group=weights.process_group,
        )
        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            index,
            "down_proj",
            process_group=weights.process_group,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


class Qwen2Layer(nn.Module):
    def __init__(self, prefix, layer_id, config, weights):
        super().__init__()
        prefix = f"{prefix}.layers.{layer_id}"
        self.self_attn = Qwen2Attention(
            index=layer_id, prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.mlp = Qwen2MLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights, index=layer_id
        )
        self.input_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = FastRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        prefill_cache_indices,
        adapter_data,
    ):
        normed_hidden_states, residual = self.input_layernorm(hidden_states)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            prefill_cache_indices,
            adapter_data,
        )
        hidden_states = attn_output + residual

        # faster post attention rms norm
        hidden_states, residual = self.post_attention_layernorm(hidden_states)
        mlp_output = self.mlp(hidden_states, adapter_data)
        hidden_states = mlp_output + residual
        return hidden_states


class Qwen2Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        prefix = f"{prefix}.model" if prefix else "model"

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.layers = nn.ModuleList(
            [
                Qwen2Layer(
                    prefix,
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        true_max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        adapter_data,
    ) -> torch.Tensor:
        hidden_states = inputs_embeds

        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids,
            true_max_s,
            hidden_states.dtype,
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
                prefill_cache_indices,
                adapter_data,
            )

        hidden_states, _ = self.norm(hidden_states)

        return hidden_states


class Qwen2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        self.model = Qwen2Model(prefix, config, weights)

        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        self.lm_head = SpeculativeHead.load(
            config,
            prefix=f"{prefix}.{suffix}" if prefix else suffix,
            weights=weights,
        )

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens" if prefix else "model.embed_tokens",
            weights=weights,
        )

        self.window_size = config.sliding_window
        self.window_size_tensor = (
            torch.tensor(config.sliding_window, device=weights.device)
            if self.window_size is not None
            else None
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        true_max_s = max_s
        if prefill_cache_indices is not None:
            # Slots also need to be sliced as it has the same size as the whole kv tensor
            slots = slots[prefill_cache_indices]
        elif self.window_size is not None:
            # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
            # kernel requires the true values
            seqlen = seqlen.clamp(max=self.window_size_tensor)

        inputs_embeds = self.embed_tokens(input_ids)

        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            true_max_s,
            prefill_cache_indices,
            adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
================================================
from typing import List, Optional, Tuple

import torch
import torch.distributed
from torch import nn
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel
from text_generation_server.layers import (
    SpeculativeHead,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import FastLayerNorm
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.layers.attention import (
    attention,
    paged_attention,
    Seqlen,
)


def load_row(config, prefix: str, weights, bias: bool):
    weight = weights.get_weights_row(prefix)

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None

    linear = get_linear(weight, bias)
    if config.parallel_attn:
        return linear
    else:
        return TensorParallelRowLinear(linear, process_group=weights.process_group)


class RWConfig(PretrainedConfig):
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_key_value_heads": "n_head_kv",
    }

    def __init__(
        self,
        model_type="RefinedWeb",
        vocab_size=250880,
        hidden_size=64,
        num_hidden_layers=None,
        num_attention_heads=None,
        num_ln_in_prallel_attention=None,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        num_kv_heads=None,
        multi_query=False,
        alibi=False,
        new_decoder_architecture=None,
        bias=False,
        parallel_attn=False,
        rope_theta=10_000.0,
        **kwargs,
    ):
        if alibi:
            raise NotImplementedError(
                "alibi is not supported by this version of the model"
            )

        self.model_type = model_type
        self.alibi = False
        self.rotary = True
        self.rope_theta = rope_theta
        self.max_position_embeddings = 2048

        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = (
            num_hidden_layers
            if num_hidden_layers is not None
            else kwargs.pop("n_layer", 2)
        )
        self.n_head = (
            num_attention_heads
            if num_attention_heads is not None
            else kwargs.pop("n_head", 8)
        )
        self.layer_norm_epsilon = layer_norm_epsilon
        self.num_ln_in_parallel_attn = num_ln_in_prallel_attention
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.bias = bias
        self.parallel_attn = parallel_attn

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        if num_kv_heads is not None:
            self.n_head_kv = num_kv_heads
        else:
            old_n_head_kv = kwargs.pop("n_head_kv", None)
            if old_n_head_kv is not None:
                self.n_head_kv = old_n_head_kv
            else:
                self.n_head_kv = 1 if multi_query else self.n_head

        if new_decoder_architecture is not None:
            self.new_decoder_architecture = new_decoder_architecture
        elif model_type == "RefinedWeb":
            self.new_decoder_architecture = True
        else:
            self.new_decoder_architecture = False

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)


class FlashRWAttention(torch.nn.Module):
    def __init__(
        self,
        config,
        prefix: str,
        weights,
    ):
        super().__init__()
        self.num_heads = config.n_head
        self.num_heads_kv = config.n_head_kv
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        self.rope_theta = config.rope_theta

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=self.rope_theta,
            device=weights.device,
        )
        self.softmax_scale = self.head_size ** (-0.5)

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.query_key_value = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.query_key_value",
            weights=weights,
            bias=config.bias,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")
        self.dense = load_row(
            config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias
        )

        if self.num_heads_kv == 1:
            self.kv_head_mapping = torch.zeros(
                self.num_heads, dtype=torch.int32, device=weights.device
            )
        else:
            self.kv_head_mapping = torch.arange(
                0, self.num_heads, dtype=torch.int32, device=weights.device
            )

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        qkv = self.query_key_value(hidden_states)

        # Split query from key_value
        query, kv = qkv.split(
            [self.head_size * self.num_heads, 2 * self.head_size * self.num_heads_kv],
            dim=1,
        )

        # Prepare query and key_value for indexing
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_heads_kv, self.head_size)

        # Inplace rotary
        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, 0],
            value=kv[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv[:, 0],
                value=kv[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.dense(attn_output.view(-1, self.num_heads * self.head_size))


class FlashRWLargeAttention(torch.nn.Module):
    def __init__(
        self,
        config,
        prefix: str,
        weights,
    ):
        super().__init__()

        hidden_size = config.hidden_size
        num_heads = config.n_head
        # num_heads_kv = config.n_head_kv
        num_groups = config.n_head_kv

        self.hidden_size = hidden_size
        self.head_size = hidden_size // num_heads
        self.num_groups = num_groups
        self.rope_theta = config.rope_theta

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=self.rope_theta,
            device=weights.device,
        )
        self.softmax_scale = self.head_size ** (-0.5)

        # self.num_groups = num_heads // (num_heads_kv * 2)
        self.num_heads = num_heads // self.num_groups
        # self.num_heads_kv = num_heads_kv // self.num_groups
        process_group = weights.process_group

        if process_group.size() > self.num_groups:
            raise NotImplementedError(
                "Tensor Parallelism is not implemented for world_size > n groups"
            )
        if self.num_groups % process_group.size() != 0:
            raise NotImplementedError(
                f"Tensor Parallelism is not implemented for {self.num_groups} not divisible by {process_group.size()}"
            )

        self.num_groups = self.num_groups // process_group.size()

        self.query_key_value = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.query_key_value",
            weights=weights,
            bias=config.bias,
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")
        self.dense = load_row(
            config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias
        )

        self.kv_head_mapping = torch.arange(
            0, self.num_groups, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_heads)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        qkv = self.query_key_value(hidden_states)
        qkv = qkv.view(-1, self.num_groups, self.num_heads + 2, self.head_size)

        # Split on group dimension
        query, kv = qkv.split(
            [self.num_heads, 2],
            dim=2,
        )
        # Merge groups and heads
        query = query.reshape(-1, self.num_groups * self.num_heads, self.head_size)

        # Inplace rotary
        self.rotary_emb(query, torch.select(kv, dim=2, index=0), cos, sin)

        kv_cache.store(
            key=kv[:, :, 0].contiguous(),
            value=kv[:, :, 1].contiguous(),
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv[:, :, 0],
                value=kv[:, :, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.dense(
            attn_output.view(-1, self.num_groups * self.num_heads * self.head_size)
        )


class FlashMLP(nn.Module):
    def __init__(self, config, prefix: str, weights):
        super().__init__()
        self.act = torch.nn.functional.gelu

        self.dense_h_to_4h = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=config.bias
        )
        self.dense_4h_to_h = load_row(
            config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=config.bias
        )

    def forward(self, hidden_states):
        hidden_states = self.dense_h_to_4h(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dense_4h_to_h(hidden_states)
        return hidden_states


class FlashRWLayer(nn.Module):
    def __init__(
        self,
        layer_id,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()

        parallel_attn = config.parallel_attn
        self.parallel_attn = parallel_attn

        prefix = f"{prefix}.h.{layer_id}"

        # NOTE: Falcon 180B uses the ln_attn prefix
        ln_prefix = "input_layernorm"
        if config.num_hidden_layers == 80:
            ln_prefix = "ln_attn"

        self.input_layernorm = FastLayerNorm.load(
            prefix=f"{prefix}.{ln_prefix}",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.self_attention = FlashRWAttention(
            config,
            prefix=f"{prefix}.self_attention",
            weights=weights,
        )
        self.post_attention_layernorm = (
            FastLayerNorm.load(
                prefix=f"{prefix}.post_attention_layernorm",
                weights=weights,
                eps=config.layer_norm_epsilon,
            )
            if not parallel_attn
            else None
        )

        self.mlp = FlashMLP(
            config,
            prefix=f"{prefix}.mlp",
            weights=weights,
        )

        self.process_group = weights.process_group

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        if self.parallel_attn:
            ln_hidden_states, residual = self.input_layernorm(hidden_states, residual)

            attn_output = self.self_attention(
                ln_hidden_states,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache,
                block_tables,
                slots,
                seqlen,
                max_s,
            )

            mlp_output = self.mlp(ln_hidden_states)
            intermediate = mlp_output + attn_output

            if self.process_group.size() > 1:
                torch.distributed.all_reduce(intermediate, group=self.process_group)

            return intermediate, residual
        else:
            hidden_states, residual = self.input_layernorm(hidden_states, residual)

            hidden_states = self.self_attention(
                hidden_states,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache,
                block_tables,
                slots,
                seqlen,
                max_s,
            )

            if self.post_attention_layernorm is not None:
                hidden_states, residual = self.post_attention_layernorm(
                    hidden_states, residual
                )

            mlp_output = self.mlp(hidden_states)

            return mlp_output, residual


class FlashRWLayerNorm(nn.Module):
    def __init__(self, config, prefix: str, weights):
        super().__init__()
        # Falcon2 includes the number of layer norms in the config
        # in the case no number of layer norms is provided, we default to 1
        self.num_ln = getattr(config, "num_ln_in_parallel_attn", 1)

        # Falcon 180B uses the ln_attn prefix and has 2 layer norms
        if config.num_hidden_layers == 80:
            self.num_ln = 2

        if self.num_ln == 1:
            self.input_ln = FastLayerNorm.load(
                prefix=f"{prefix}.input_layernorm",
                weights=weights,
                eps=config.layer_norm_epsilon,
            )
        elif self.num_ln == 2:
            self.ln_attn = FastLayerNorm.load(
                prefix=f"{prefix}.ln_attn",
                weights=weights,
                eps=config.layer_norm_epsilon,
            )
            self.ln_mlp = FastLayerNorm.load(
                prefix=f"{prefix}.ln_mlp",
                weights=weights,
                eps=config.layer_norm_epsilon,
            )
        else:
            raise ValueError("Number of layer norms can either be 1 or 2.")

    def forward(
        self,
        hidden_states,
        residual,
    ):
        if self.num_ln == 1:
            ln_hidden_states, residual = self.input_ln(hidden_states, residual)
            return ln_hidden_states, ln_hidden_states, residual
        elif self.num_ln == 2:
            ln_attn, residual = self.ln_attn(hidden_states, residual)
            ln_mlp, _ = self.ln_mlp(residual)
            return ln_attn, ln_mlp, residual


class FlashRWLargeLayer(nn.Module):
    def __init__(self, layer_id, prefix: str, config, weights):
        super().__init__()
        prefix = f"{prefix}.h.{layer_id}"

        self.ln_layer = FlashRWLayerNorm(config, prefix, weights)

        self.self_attention = FlashRWLargeAttention(
            config,
            prefix=f"{prefix}.self_attention",
            weights=weights,
        )
        assert config.parallel_attn, "This version doesn't support non parallel_attn"

        self.mlp = FlashMLP(config, prefix=f"{prefix}.mlp", weights=weights)

        self.process_group = weights.process_group

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        # Layer norm.
        ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual)

        # Self attention.
        attn_output = self.self_attention(
            ln_attn,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        # MLP.
        mlp_output = self.mlp(ln_mlp)

        intermediate = attn_output + mlp_output

        if self.process_group.size() > 1:
            torch.distributed.all_reduce(intermediate, group=self.process_group)

        return intermediate, residual


class FlashRWPreTrainedModel(PreTrainedModel):
    config_class = RWConfig


class FlashRWModel(FlashRWPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)
        self.config = config

        self.word_embeddings = TensorParallelEmbedding(
            prefix=f"{prefix}.word_embeddings", weights=weights
        )

        if config.new_decoder_architecture:
            self.h = nn.ModuleList(
                [
                    FlashRWLargeLayer(layer_id, prefix, config, weights)
                    for layer_id in range(config.num_hidden_layers)
                ]
            )
            self.cache_size = self.h[0].self_attention.num_groups
        else:
            self.h = nn.ModuleList(
                [
                    FlashRWLayer(layer_id, prefix, config, weights)
                    for layer_id in range(config.num_hidden_layers)
                ]
            )
            self.cache_size = self.h[0].self_attention.num_heads_kv

        self.ln_f = FastLayerNorm.load(
            prefix=f"{prefix}.ln_f",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.head_size = self.h[0].self_attention.head_size

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.word_embeddings(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.h[0].self_attention.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.h):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.ln_f(hidden_states, residual)

        return hidden_states


class FlashRWForCausalLM(FlashRWPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)

        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"

        self.transformer = FlashRWModel(prefix, config, weights)

        self.lm_head = SpeculativeHead.load(config, prefix="lm_head", weights=weights)

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.transformer(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
================================================
import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    SpeculativeHead,
    TensorParallelEmbedding,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.gptq import GPTQWeightsLoader
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
)


def load_multi_mqa(
    config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size
):
    if config.quantize == "gptq":
        return _load_multi_mqa_gptq(
            config, prefix, weights, bias, head_size, num_heads, hidden_size
        )
    elif config.quantize == "marlin":
        raise RuntimeError(
            "santacoder models with marlin quantization are not yet supported"
        )
    else:
        return _load_multi_mqa(
            config, prefix, weights, bias, head_size, num_heads, hidden_size
        )


def _load_multi_mqa_gptq(
    config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size
):
    from text_generation_server.layers.gptq import GPTQWeight

    if any("c_attn" in k for k in weights.routing.keys()) and not config.transpose:
        world_size = weights.process_group.size()
        rank = weights.process_group.rank()

        slice_ = weights._get_slice(f"{prefix}.c_attn.qweight")
        shape = slice_.get_shape()
        block_size = (shape[1] - 2 * head_size) // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        assert (shape[1] - 2 * head_size) % world_size == 0
        q_tensor = slice_[:, start:stop]
        kv_tensor = slice_[:, -2 * head_size :]
        qweight = torch.cat([q_tensor, kv_tensor], dim=1)
        qweight = qweight.to(device=weights.device)

        slice_ = weights._get_slice(f"{prefix}.c_attn.scales")
        shape = slice_.get_shape()
        block_size = (shape[1] - 2 * head_size) // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        assert (shape[1] - 2 * head_size) % world_size == 0
        q_tensor = slice_[:, start:stop]
        kv_tensor = slice_[:, -2 * head_size :]
        scales = torch.cat([q_tensor, kv_tensor], dim=1)
        scales = scales.to(device=weights.device)

        slice_ = weights._get_slice(f"{prefix}.c_attn.qzeros")
        shape = slice_.get_shape()
        block_size = (shape[1] - (2 * head_size) * 4 // 32) // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        assert 2 * head_size % (32 // 4) == 0
        q_tensor = slice_[:, start:stop]
        kv_tensor = slice_[:, -2 * head_size * 4 // 32 :]
        qzeros = torch.cat([q_tensor, kv_tensor], dim=1)
        qzeros = qzeros.to(device=weights.device)

        loader = weights.weights_loader
        assert isinstance(loader, GPTQWeightsLoader)
        loader._get_gptq_params(weights)
        if loader.quant_method == "gptq":
            g_idx = weights.get_tensor(f"{prefix}.c_attn.g_idx")
            g_idx = g_idx.to(device=weights.device)
        elif loader.quant_method == "awq":
            g_idx = None
            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )

            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)

        from text_generation_server.layers.gptq import HAS_EXLLAMA

        weight = GPTQWeight(
            qweight=qweight,
            qzeros=qzeros,
            scales=scales,
            g_idx=g_idx,
            bits=loader.bits,
            groupsize=loader.groupsize,
            use_awq_kernel=loader.quantize == "awq",
            use_exllama=HAS_EXLLAMA,
        )

        if bias:
            slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
            shape = slice_.get_shape()
            block_size = (shape[0] - 2 * head_size) // world_size
            assert (shape[0] - 2 * head_size) % world_size == 0
            q_tensor = slice_[start:stop]
            start = rank * block_size
            stop = (rank + 1) * block_size
            q_tensor = slice_[start:stop]
            kv_tensor = slice_[-2 * head_size :]
            bias = torch.cat([q_tensor, kv_tensor], dim=0)
            bias = bias.to(device=weights.device)

        return TensorParallelColumnLinear(get_linear(weight, bias))
    else:
        raise NotImplementedError("Gptq loading with santacoder is not implemented")


def _load_multi_mqa(
    config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size
):
    if any("c_attn" in k for k in weights.routing.keys()):
        slice_ = weights._get_slice(f"{prefix}.c_attn.weight")
        shape = slice_.get_shape()
        world_size = weights.process_group.size()
        rank = weights.process_group.rank()
        if config.transpose:
            block_size = (shape[1] - 2 * head_size) // world_size
            start = rank * block_size
            stop = (rank + 1) * block_size
            assert (shape[1] - 2 * head_size) % world_size == 0
            q_tensor = slice_[:, start:stop]
            kv_tensor = slice_[:, -2 * head_size :]
            weight = torch.cat([q_tensor, kv_tensor], dim=1).T
        else:
            block_size = (shape[0] - 2 * head_size) // world_size
            start = rank * block_size
            stop = (rank + 1) * block_size
            assert (shape[0] - 2 * head_size) % world_size == 0
            q_tensor = slice_[start:stop]
            kv_tensor = slice_[-2 * head_size :]
            weight = torch.cat([q_tensor, kv_tensor], dim=0)
        if bias:
            slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
            shape = slice_.get_shape()
            block_size = (shape[0] - 2 * head_size) // world_size
            assert (shape[0] - 2 * head_size) % world_size == 0
            start = rank * block_size
            stop = (rank + 1) * block_size
            q_tensor = slice_[start:stop]
            kv_tensor = slice_[-2 * head_size :]
            bias = torch.cat([q_tensor, kv_tensor], dim=0)
    else:
        if config.transpose:
            w = [
                weights.get_sharded(f"{prefix}.q_attn.weight", dim=1).T,
                weights.get_tensor(f"{prefix}.kv_attn.weight").T,
            ]
            weight = torch.cat(w, dim=0)
        else:
            w = [
                weights.get_sharded(f"{prefix}.q_attn.weight", dim=0),
                weights.get_tensor(f"{prefix}.kv_attn.weight"),
            ]
            weight = torch.cat(w, dim=1)

        if bias:
            b = [
                weights.get_sharded(f"{prefix}.q_attn.bias", dim=0),
                weights.get_tensor(f"{prefix}.kv_attn.bias"),
            ]
            bias = torch.cat(b, dim=0)
        else:
            bias = None

    weight = weight.to(dtype=weights.dtype).to(device=weights.device)
    assert list(weight.shape) == [
        (num_heads + 2) * head_size,
        hidden_size,
    ], f"{weight.shape} != {[(num_heads + 2) * head_size, hidden_size]}"
    if bias is not None:
        bias = bias.to(dtype=weights.dtype).to(device=weights.device)
        assert list(bias.shape) == [
            (num_heads + 2) * head_size
        ], f"{weight.shape} != {[(num_heads + 2) * head_size]}"
    return TensorParallelColumnLinear(get_linear(weight, bias))


def load_col(config, prefix: str, weights, bias: bool):
    if config.transpose:
        weight = weights.get_sharded(f"{prefix}.weight", dim=1).T
    else:
        weight = weights.get_multi_weights_col([prefix], dim=0)

    if bias:
        bias = weights.get_sharded(f"{prefix}.bias", dim=0)
    else:
        bias = None
    return TensorParallelColumnLinear(get_linear(weight, bias))


def load_row(config, prefix: str, weights, bias: bool):
    if config.transpose:
        weight = weights.get_sharded(f"{prefix}.weight", dim=0).T
    else:
        weight = weights.get_weights_row(prefix)

    if bias and weights.process_group.rank() == 0:
        # Rank is only on the first rank process
        bias = weights.get_tensor(f"{prefix}.bias")
    else:
        bias = None
    return TensorParallelRowLinear(
        get_linear(weight, bias), process_group=weights.process_group
    )


class FlashMQAttention(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        num_heads = config.num_attention_heads
        hidden_size = config.hidden_size

        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.head_size = hidden_size // num_heads

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()

        self.softmax_scale = self.head_size ** (-0.5)

        self.c_attn = load_multi_mqa(
            config,
            prefix=prefix,
            weights=weights,
            bias=True,
            head_size=self.head_size,
            hidden_size=hidden_size,
            num_heads=self.num_heads,
        )
        self.c_proj = load_row(
            config, prefix=f"{prefix}.c_proj", weights=weights, bias=True
        )
        self.kv_scales = get_kv_scales(weights, f"{prefix}")
        self.kv_head_mapping = torch.zeros(
            self.num_heads, dtype=torch.int32, device=weights.device
        )

    def forward(
        self,
        hidden_states,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        qkv = self.c_attn(hidden_states)

        # Split query from key_value
        query, key_value = qkv.split(
            [self.head_size * self.num_heads, 2 * self.head_size], dim=1
        )

        # Prepare query and key_value for indexing
        query = query.view(-1, self.num_heads, self.head_size)
        key_value = key_value.view(-1, 2, 1, self.head_size)

        kv_cache.store(
            key=key_value[:, 0],
            value=key_value[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=key_value[:, 0],
                value=key_value[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
            )

        return self.c_proj(attn_output.view(-1, self.num_heads * self.head_size))


class MLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.activation_function
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )

        self.c_fc = load_col(
            config, prefix=f"{prefix}.c_fc", weights=weights, bias=True
        )
        self.c_proj = load_row(
            config, prefix=f"{prefix}.c_proj", weights=weights, bias=True
        )

    def forward(self, hidden_states):
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        return hidden_states


class Block(nn.Module):
    def __init__(self, prefix: str, layer_id, config, weights):
        super().__init__()
        prefix = f"{prefix}.h.{layer_id}"
        self.ln_1 = FastLayerNorm.load(
            prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
        )
        self.ln_2 = FastLayerNorm.load(
            prefix=f"{prefix}.ln_2", weights=weights, eps=config.layer_norm_epsilon
        )
        self.self_attn = FlashMQAttention(
            prefix=f"{prefix}.attn",
            config=config,
            weights=weights,
        )
        self.mlp = MLP(
            prefix=f"{prefix}.mlp",
            config=config,
            weights=weights,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
    ):
        hidden_states, residual = self.ln_1(hidden_states, residual)
        hidden_states = self.self_attn(
            hidden_states,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )

        hidden_states, residual = self.ln_2(hidden_states, residual)

        mlp_output = self.mlp(hidden_states)

        return mlp_output, residual


class FlashSantacoderModel(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.config = config

        self.process_group = weights.process_group
        self.wte = TensorParallelEmbedding(
            prefix=f"{prefix}.wte",
            weights=weights,
            reduce=False,
        )
        self.wpe = TensorParallelEmbedding(
            prefix=f"{prefix}.wpe",
            weights=weights,
            reduce=False,
        )

        self.layers = nn.ModuleList(
            [
                Block(
                    prefix,
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.ln_f = FastLayerNorm.load(
            prefix="transformer.ln_f", weights=weights, eps=config.layer_norm_epsilon
        )

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
    ) -> torch.Tensor:
        hidden_states = self.wte(input_ids) + self.wpe(position_ids)

        if self.process_group.size() > 1:
            torch.distributed.all_reduce(hidden_states, group=self.process_group)

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
            )

        hidden_states, _ = self.ln_f(hidden_states, residual)

        return hidden_states


class FlashSantacoderForCausalLM(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"

        config.transpose = config.architectures[0].startswith("GPT2")
        self.model = FlashSantacoderModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config, prefix=f"{prefix}.wte", weights=weights
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
================================================
# coding=utf-8
# Copyright 2024 Starcoder2 AI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed

from torch import nn
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from typing import Optional, List, Tuple

from text_generation_server.layers.attention import (
    paged_attention,
    attention,
    Seqlen,
)
from text_generation_server.layers import (
    TensorParallelMultiAdapterLinear,
    TensorParallelAdapterRowLinear,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
)
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.layers.layernorm import (
    FastLayerNorm,
    FastRMSNorm,
)
from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
)
from text_generation_server.utils.weights import UnquantizedWeight


class Starcoder2Config(PretrainedConfig):
    model_type = "starcoder2"

    def __init__(
        self,
        vocab_size=49152,
        hidden_size=3072,
        intermediate_size=12288,
        num_hidden_layers=30,
        num_attention_heads=24,
        num_key_value_heads=2,
        mlp_type="default",
        hidden_act="gelu_pytorch_tanh",
        max_position_embeddings=4096,
        initializer_range=0.018042,
        norm_type="layer_norm",
        norm_epsilon=1e-5,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        rope_theta=10000.0,
        sliding_window=None,
        attention_dropout=0.0,
        residual_dropout=0.0,
        embedding_dropout=0.0,
        use_bias: bool = True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        self.use_bias = use_bias

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.mlp_type = mlp_type
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_type = norm_type
        self.norm_epsilon = norm_epsilon
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.embedding_dropout = embedding_dropout

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )


def load_attention(config, prefix, weights, layer_id):
    prefixes = [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
    head_size = config.hidden_size // config.num_attention_heads
    sizes = [
        head_size * config.num_attention_heads,
        head_size * config.num_key_value_heads,
        head_size * config.num_key_value_heads,
    ]
    if config.num_attention_heads != config.num_key_value_heads:
        base_layer = _load_gqa(config, prefix, weights)
    else:
        base_layer = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=prefixes,
            dim=0,
            weights=weights,
            bias=config.use_bias,
        )
    return TensorParallelMultiAdapterLinear.load(
        base_layer=base_layer,
        layer_id=layer_id,
        layer_names=prefixes,
        sizes=sizes,
        process_group=weights.process_group,
    )


def _load_gqa(config, prefix: str, weights):
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        dim=0,
    )

    if isinstance(weight, UnquantizedWeight):
        weight.weight = weight.weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        assert list(weight.weight.shape) == [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ], f"{list(weight.weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

    if config.use_bias:
        w = [
            weights.get_sharded(f"{p}.bias", dim=0)
            for p in [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
        ]
        bias = torch.cat(w, dim=0).to(dtype=weights.dtype).to(device=weights.device)
    else:
        bias = None

    return TensorParallelColumnLinear(get_linear(weight, bias=bias))


class Starcoder2Attention(torch.nn.Module):
    def __init__(
        self,
        index: int,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.max_past = (
            config.sliding_window if config.sliding_window is not None else -1
        )
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
            dim=self.head_size,
            base=config.rope_theta,
            device=weights.device,
        )

        self.softmax_scale = self.head_size**-0.5

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            config.num_key_value_heads // weights.process_group.size()
        )

        self.query_key_value = load_attention(config, prefix, weights, index)
        self.kv_scales = get_kv_scales(weights, f"{prefix}")

        o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=getattr(config, "use_bias", False),
        )

        self.o_proj = TensorParallelAdapterRowLinear.load(
            o_proj,
            index,
            "o_proj",
            process_group=weights.process_group,
        )

        self.num_groups = self.num_heads // self.num_key_value_heads
        self.kv_head_mapping = torch.arange(
            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
        ).repeat_interleave(self.num_groups)

    def forward(
        self,
        hidden_states,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        prefill_cache_indices,
        adapter_data,
    ):
        qkv = self.query_key_value(hidden_states, adapter_data)
        query, kv = qkv.split(
            [
                self.head_size * self.num_heads,
                2 * self.head_size * self.num_key_value_heads,
            ],
            dim=1,
        )
        query = query.view(-1, self.num_heads, self.head_size)
        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        if prefill_cache_indices is not None:
            kv_to_cache = kv[prefill_cache_indices]
        else:
            kv_to_cache = kv

        kv_cache.store(
            key=kv_to_cache[:, 0],
            value=kv_to_cache[:, 1],
            slots=slots,
            kv_scales=self.kv_scales,
        )

        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
            attn_output = attention(
                query=query,
                key=kv_to_cache[:, 0],
                value=kv_to_cache[:, 1],
                kv_cache=kv_cache,
                kv_scales=self.kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=self.softmax_scale,
                window_size_left=self.max_past,
            )
        # Decode
        else:
            attn_output = paged_attention(
                query,
                kv_cache,
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                seqlen,
                max_s,
                kv_scales=self.kv_scales,
                window_size_left=self.max_past,
            )

        return self.o_proj(
            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
        )


class Starcoder2MLP(nn.Module):
    def __init__(self, prefix, config, weights, index):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        c_fc = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.c_fc",
            weights=weights,
            bias=config.use_bias,
        )
        c_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=config.use_bias,
        )

        self.c_fc = TensorParallelMultiAdapterLinear.load(
            c_fc,
            layer_id=index,
            layer_names=[f"{prefix}.c_fc"],
            sizes=[config.intermediate_size, config.intermediate_size],
            process_group=weights.process_group,
        )

        self.c_proj = TensorParallelAdapterRowLinear.load(
            c_proj,
            index,
            "c_proj",
            process_group=weights.process_group,
        )

    def forward(self, hidden_states, adapter_data):
        hidden_states = self.c_fc(hidden_states, adapter_data)
        hidden_states = self.act(hidden_states)
        return self.c_proj(hidden_states, adapter_data)


class Starcoder2GatedMLP(nn.Module):
    def __init__(self, index, prefix, config, weights):
        super().__init__()
        act = config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        # Fuse gate and up proj
        prefixes = [f"{prefix}.gate_proj", f"{prefix}.up_proj"]
        sizes = [
            config.intermediate_size,
            config.intermediate_size,
        ]
        gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=prefixes,
            weights=weights,
            dim=0,
            bias=config.use_bias,
        )
        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
            gate_up_proj,
            index,
            layer_names=prefixes,
            sizes=sizes,
            process_group=weights.process_group,
        )
        down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=config.use_bias,
        )
        self.down_proj = TensorParallelAdapterRowLinear.load(
            down_proj,
            index,
            "down_proj",
            process_group=weights.process_group,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

    def forward(self, hidden_states, adapter_data):
        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
        )


STARCODER2_NORMALIZATION_CLASSES = {
    "layer_norm": FastLayerNorm,
    "rms_norm": FastRMSNorm,
}

STARCODER2_MLP_CLASSES = {
    "default": Starcoder2MLP,
    "gated": Starcoder2GatedMLP,
}


class Starcoder2Layer(nn.Module):
    def __init__(self, layer_id, config, weights):
        super().__init__()
        prefix = f"model.layers.{layer_id}"
        self.self_attn = Starcoder2Attention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights, index=layer_id
        )

        self.mlp = STARCODER2_MLP_CLASSES[config.mlp_type](
            prefix=f"{prefix}.mlp", config=config, weights=weights, index=layer_id
        )

        self.input_layernorm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.norm_epsilon
        )
        self.post_attention_layernorm = STARCODER2_NORMALIZATION_CLASSES[
            config.norm_type
        ].load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.norm_epsilon,
        )

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        prefill_cache_indices,
        adapter_data,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

        # Self Attention
        attn_output = self.self_attn(
            normed_hidden_states,
            cos,
            sin,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            prefill_cache_indices,
            adapter_data,
        )

        # faster post attention rms norm
        normed_attn_res_output, attn_res = self.post_attention_layernorm(
            attn_output, res
        )

        mlp_output = self.mlp(normed_attn_res_output, adapter_data)

        return mlp_output, attn_res


class Starcoder2Model(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.layers = nn.ModuleList(
            [
                Starcoder2Layer(
                    layer_id,
                    config,
                    weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.norm_epsilon
        )

        self.gradient_checkpointing = False

        self.head_size = self.layers[0].self_attn.head_size
        self.num_heads = self.layers[0].self_attn.num_heads
        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        true_max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        adapter_data,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)

        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, true_max_s, hidden_states.dtype
        )

        residual = None
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cos,
                sin,
                cu_seqlen_prefill,
                kv_cache[i],
                block_tables,
                slots,
                seqlen,
                max_s,
                prefill_cache_indices,
                adapter_data,
            )

        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states


class FlashStarcoder2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        if not prefix:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"

        self.model = Starcoder2Model(prefix, config, weights)
        try:
            self.lm_head = SpeculativeHead.load(
                config,
                prefix="lm_head",
                weights=weights,
            )
        except RuntimeError:
            self.lm_head = SpeculativeHead.load(
                config,
                prefix=f"{prefix}.embed_tokens",
                weights=weights,
            )

        self.max_past = config.sliding_window
        self.max_past_tensor = (
            torch.tensor(config.sliding_window, device=weights.device)
            if self.max_past is not None
            else None
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        true_max_s = max_s
        if prefill_cache_indices is not None:
            # Slots also need to be sliced as it has the same size as the whole kv tensor
            slots = slots[prefill_cache_indices]
        elif self.max_past is not None:
            # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
            # kernel requires the true values
            seqlen = seqlen.clamp(max=self.max_past_tensor)

        hidden_states = self.model(
            input_ids,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            seqlen,
            max_s,
            true_max_s,
            prefill_cache_indices,
            adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits = self.lm_head(hidden_states)
        return logits


================================================
FILE: server/text_generation_server/models/custom_modeling/gemma3/configuration_gemma3.py
================================================
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/gemma3/modular_gemma3.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_gemma3.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging
from transformers import SiglipVisionConfig

logger = logging.get_logger(__name__)


class Gemma3TextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Gemma3Model`]. It is used to instantiate a Gemma3
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3-4B.
    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262144):
            Vocabulary size of the Gemma3 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3Model`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        sliding_window (`int`, *optional*, defaults to 4096): in Gemma3, every other layer uses sliding window
            attention. This is the size of the sliding window.
        query_pre_attn_scalar (`float`, *optional*):
            The scaling factor used on the attention scores, not that
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings used for global attention.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.
        sliding_window_pattern (`int`, *optional*, defaults to 6):
            Pattern for the sliding window attention.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to
            `"gelu_pytorch_tanh"` if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"`
            activation function.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        final_logit_softcapping (`bool`, *optional*, defaults to `True`):
            Whether to apply logit softcapping or nor
        attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
            Scaling factor when applying tanh soft-capping on the attention scorexs.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`):
            The cache type to be used with `generate`.

    ```python
    >>> from transformers import Gemma3Model, Gemma3TextConfig
    >>> # Initializing a Gemma3 gemma3-4b style configuration
    >>> configuration = Gemma3Config()
    >>> # Initializing a model from the gemma3-4b style configuration
    >>> model = Gemma3Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "gemma3_text"

    def __init__(
        self,
        vocab_size: int = 262_144,
        hidden_size: int = 2304,
        intermediate_size: int = 9216,
        num_hidden_layers: int = 26,
        num_attention_heads: int = 8,
        num_key_value_heads: int = 4,
        head_dim: int = 256,
        sliding_window: int = 4096,
        query_pre_attn_scalar: Optional[float] = 256,
        rope_theta: float = 1_000_000.0,
        rope_scaling=None,
        rope_local_base_freq: float = 10_000.0,
        sliding_window_pattern: int = 6,
        rms_norm_eps: float = 1e-6,
        hidden_activation: str = "gelu_pytorch_tanh",
        pad_token_id: int = 0,
        eos_token_id: int = 1,
        bos_token_id: int = 2,
        tie_word_embeddings: bool = True,
        max_position_embeddings: int = 131_072,
        initializer_range: float = 0.02,
        attention_bias: bool = False,
        attention_dropout: float = 0.0,
        use_cache: bool = True,
        final_logit_softcapping=None,
        attn_logit_softcapping=None,
        cache_implementation: str = "hybrid",
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.rope_local_base_freq = rope_local_base_freq
        # For configuring HybridCache to work with 5:1 attention pattern
        self.sliding_window_pattern = sliding_window_pattern
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.query_pre_attn_scalar = query_pre_attn_scalar
        self.sliding_window = sliding_window
        self.final_logit_softcapping = final_logit_softcapping
        self.attn_logit_softcapping = attn_logit_softcapping
        self.cache_implementation = cache_implementation
        rope_config_validation(self)


class Gemma3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "gemma3"
    sub_configs = {
        "text_config": Gemma3TextConfig,
        "vision_config": SiglipVisionConfig,
    }

    def __init__(
        self,
        text_config: Optional[Gemma3TextConfig] = None,
        vision_config: Optional[SiglipVisionConfig] = None,
        mm_tokens_per_image: int = 256,
        boi_token_index: int = 255_999,
        eoi_token_index: int = 256_000,
        image_token_index: int = 262_144,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        if text_config is None:
            text_config = Gemma3TextConfig()
            logger.info(
                "text_config is None, using default Gemma3TextConfig vision config."
            )
        elif isinstance(text_config, dict):
            text_config = Gemma3TextConfig(**text_config)

        if isinstance(vision_config, dict):
            vision_config = SiglipVisionConfig(**vision_config)
        else:
            vision_config = SiglipVisionConfig()
            logger.info(
                "vision_config is None or incompatible with Gemma3VisionConfig intialization. Gemma3 will be limited "
                "to text tasks."
            )

        self.text_config = text_config
        self.vision_config = vision_config
        self.mm_tokens_per_image = mm_tokens_per_image
        self.boi_token_index = boi_token_index
        self.eoi_token_index = eoi_token_index
        self.image_token_index = image_token_index
        self.initializer_range = initializer_range

        super().__init__(**kwargs)


__all__ = ["Gemma3Config", "Gemma3TextConfig"]


================================================
FILE: server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
================================================
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Gemma3."""

import itertools
import math
from typing import Dict, List, Optional, Union

import numpy as np

from transformers.image_processing_utils import (
    BaseImageProcessor,
    BatchFeature,
    get_size_dict,
)
from transformers.image_transforms import (
    convert_to_rgb,
    resize,
    to_channel_dimension_format,
)
from transformers.image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)
from transformers.utils import (
    TensorType,
    filter_out_non_signature_kwargs,
    is_vision_available,
    logging,
)

from .utils import make_nested_list_of_images


logger = logging.get_logger(__name__)


if is_vision_available():
    import PIL


class Gemma3ImageProcessor(BaseImageProcessor):
    r"""
    Constructs a SigLIP image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
            `do_normalize` in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_pan_and_scan (`bool`, *optional*):
            Whether to apply `pan_and_scan` to images.
    """

    model_input_names = ["pixel_values", "num_crops"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = False,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        do_pan_and_scan: bool = None,
        pan_and_scan_min_crop_size: int = None,
        pan_and_scan_max_num_crops: int = None,
        pan_and_scan_min_ratio_to_activate: float = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"height": 224, "width": 224}
        image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD

        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb
        self.do_pan_and_scan = do_pan_and_scan
        self.pan_and_scan_min_crop_size = pan_and_scan_min_crop_size
        self.pan_and_scan_max_num_crops = pan_and_scan_max_num_crops
        self.pan_and_scan_min_ratio_to_activate = pan_and_scan_min_ratio_to_activate

    def pan_and_scan(
        self,
        image: np.ndarray,
        pan_and_scan_min_crop_size: int,
        pan_and_scan_max_num_crops: int,
        pan_and_scan_min_ratio_to_activate: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Pan and Scan and image, whatever it means. TODO: write-up docs

        Args:
            image (`np.ndarray`):
                Image to resize.
            pan_and_scan_min_crop_size (`int`):
                Size of pan_and_scan_min_crop_size.
            pan_and_scan_max_num_crops (`int`):
                pan_and_scan_max_num_crops for the image.
            pan_and_scan_min_ratio_to_activate (`int`):
                pan_and_scan_min_ratio_to_activate for the image..
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        height, width = get_image_size(image)

        # Square or landscape image.
        if width >= height:
            # Only apply PaS if the image is sufficiently exaggerated
            if width / height < pan_and_scan_min_ratio_to_activate:
                return []

            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
            num_crops_w = int(
                math.floor(width / height + 0.5)
            )  # Half round up rounding.
            num_crops_w = min(
                int(math.floor(width / pan_and_scan_min_crop_size)), num_crops_w
            )

            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
            num_crops_w = max(2, num_crops_w)
            num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w)
            num_crops_h = 1

        # Portrait image.
        else:
            # Only apply PaS if the image is sufficiently exaggerated
            if height / width < pan_and_scan_min_ratio_to_activate:
                return []

            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
            num_crops_h = int(math.floor(height / width + 0.5))
            num_crops_h = min(
                int(math.floor(height / pan_and_scan_min_crop_size)), num_crops_h
            )

            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
            num_crops_h = max(2, num_crops_h)
            num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
            num_crops_w = 1

        crop_size_w = int(math.ceil(width / num_crops_w))
        crop_size_h = int(math.ceil(height / num_crops_h))

        # Don't apply PaS if crop size is too small.
        if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
            return []

        crop_positions_w = [crop_size_w * i for i in range(num_crops_w)]
        crop_positions_h = [crop_size_h * i for i in range(num_crops_h)]

        if input_data_format == ChannelDimension.LAST:
            image_crops = [
                image[pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w]
                for pos_h, pos_w in itertools.product(
                    crop_positions_h, crop_positions_w
                )
            ]
        else:
            image_crops = [
                image[:, pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w]
                for pos_h, pos_w in itertools.product(
                    crop_positions_h, crop_positions_w
                )
            ]

        return image_crops

    def _process_images_for_pas(
        self,
        images: List[np.ndarray],
        do_pan_and_scan: bool,
        pan_and_scan_min_crop_size: int,
        pan_and_scan_max_num_crops: int,
        pan_and_scan_min_ratio_to_activate: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        pas_images_list = []
        num_crops = []
        for image in images:
            pas_images = self.pan_and_scan(
                image=image,
                pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
                pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
                pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
                data_format=data_format,
                input_data_format=input_data_format,
            )
            pas_images_list.extend([image] + pas_images)
            num_crops.append(len(pas_images))
        return pas_images_list, num_crops

    @filter_out_non_signature_kwargs()
    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        do_convert_rgb: bool = True,
        do_pan_and_scan: bool = None,
        pan_and_scan_min_crop_size: int = None,
        pan_and_scan_max_num_crops: int = None,
        pan_and_scan_min_ratio_to_activate: float = None,
    ) -> PIL.Image.Image:
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_pan_and_scan (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to apply `pan_and_scan` to images.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        size = get_size_dict(size, param_name="size", default_to_square=False)
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = (
            rescale_factor if rescale_factor is not None else self.rescale_factor
        )
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = (
            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        )
        do_pan_and_scan = (
            do_pan_and_scan if do_pan_and_scan is not None else self.do_pan_and_scan
        )
        pan_and_scan_min_crop_size = (
            pan_and_scan_min_crop_size
            if pan_and_scan_min_crop_size is not None
            else self.pan_and_scan_min_crop_size
        )
        pan_and_scan_max_num_crops = (
            pan_and_scan_max_num_crops
            if pan_and_scan_max_num_crops is not None
            else self.pan_and_scan_max_num_crops
        )
        pan_and_scan_min_ratio_to_activate = (
            pan_and_scan_min_ratio_to_activate
            if pan_and_scan_min_ratio_to_activate is not None
            else self.pan_and_scan_min_ratio_to_activate
        )

        images_list = make_nested_list_of_images(images)

        if not valid_images(images_list[0]):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )
        if do_convert_rgb:
            images_list = [
                [convert_to_rgb(image) for image in images] for images in images_list
            ]

        # All transformations expect numpy arrays.
        images_list = [
            [to_numpy_array(image) for image in images] for images in images_list
        ]

        if do_rescale and is_scaled_image(images_list[0][0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images_list[0][0])

        if do_pan_and_scan:
            images_list_and_num_crops = [
                self._process_images_for_pas(
                    images=images,
                    do_pan_and_scan=do_pan_and_scan,
                    pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
                    pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
                    pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
                    data_format=data_format,
                    input_data_format=input_data_format,
                )
                for images in images_list
            ]
            images_list = [images for images, _ in images_list_and_num_crops]
            num_crops = [num_crops for _, num_crops in images_list_and_num_crops]
        else:
            num_crops = [[0] for images in images_list]

        if do_resize:
            height, width = size["height"], size["width"]
            images_list = [
                [
                    resize(
                        image=image,
                        size=(height, width),
                        resample=resample,
                        input_data_format=input_data_format,
                    )
                    for image in images
                ]
                for images in images_list
            ]

        if do_rescale:
            images_list = [
                [
                    self.rescale(
                        image=image,
                        scale=rescale_factor,
                        input_data_format=input_data_format,
                    )
                    for image in images
                ]
                for images in images_list
            ]

        if do_normalize:
            images_list = [
                [
                    self.normalize(
                        image=image,
                        mean=image_mean,
                        std=image_std,
                        input_data_format=input_data_format,
                    )
                    for image in images
                ]
                for images in images_list
            ]

        images = [
            to_channel_dimension_format(
                image, data_format, input_channel_dim=input_data_format
            )
            for images in images_list
            for image in images
        ]

        data = {"pixel_values": images, "num_crops": num_crops}
        return BatchFeature(data=data, tensor_type=return_tensors)


__all__ = ["Gemma3ImageProcessor"]


================================================
FILE: server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py
================================================
# coding=utf-8
# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import List, Optional, Union

from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import (
    ImagesKwargs,
    ProcessingKwargs,
    ProcessorMixin,
    Unpack,
)
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils import to_py_obj
from text_generation_server.models.custom_modeling.gemma3.image_processing_gemma3 import (
    Gemma3ImageProcessor,
)

from transformers.image_utils import PILImageResampling

from .utils import make_nested_list_of_images


class Gemma3ImagesKwargs(ImagesKwargs):
    do_pan_and_scan: Optional[bool]
    pan_and_scan_min_crop_size: Optional[int]
    pan_and_scan_max_num_crops: Optional[int]
    pan_and_scan_min_ratio_to_activate: Optional[float]
    do_convert_rgb: Optional[bool]


class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {
            "do_pan_and_scan": False,
            "pan_and_scan_min_crop_size": 256,
            "pan_and_scan_max_num_crops": 4,
            "pan_and_scan_min_ratio_to_activate": 1.2,
        },
    }


class Gemma3Processor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    # # image_processor_class = "Gemma3ImageProcessor"
    image_processor_class = "AutoProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor,
        tokenizer,
        chat_template=None,
        num_mm_soft_tokens_per_image: int = 256,
        **kwargs,
    ):
        num_mm_soft_tokens_per_image = 256
        chat_template = None

        image_processor = Gemma3ImageProcessor(
            image_mean=(127.5,) * 3,
            image_std=(127.5,) * 3,
            size={"height": 896, "width": 896},
            do_rescale=False,
            resample=PILImageResampling.BILINEAR,
        )

        self.image_token_id = tokenizer.image_token_id
        image_tokens_expanded = "".join(
            [tokenizer.image_token] * num_mm_soft_tokens_per_image
        )
        self.full_image_sequence = (
            f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
        )

        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.chat_template = chat_template

        # super().__init__(
        #     image_processor=image_processor,
        #     tokenizer=tokenizer,
        #     chat_template=chat_template,
        #     **kwargs,
        # )

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[
            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
        ] = None,
        videos=None,
        audio=None,
        **kwargs: Unpack[Gemma3ProcessorKwargs],
    ) -> BatchFeature:
        if text is None and images is None:
            raise ValueError("Provide at least one of `text` or `images`.")

        output_kwargs = self._merge_kwargs(
            Gemma3ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if isinstance(text, str):
            text = [text]
        elif not isinstance(text, list) and not isinstance(text[0], str):
            raise ValueError(
                "Invalid input text. Please provide a string, or a list of strings"
            )

        image_inputs = {}
        if images is not None:
            batched_images = make_nested_list_of_images(images)
            image_inputs = self.image_processor(
                batched_images, **output_kwargs["images_kwargs"]
            )

            # Create empty text to be replaced with placeholders
            if not text:
                text = [
                    " ".join(["<image>"] * len(images)) for images in batched_images
                ]

            if len(batched_images) != len(text):
                raise ValueError(
                    f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})."
                )

            # Replace image tokens by the full expanded sequence
            batch_num_crops = to_py_obj(image_inputs.pop("num_crops"))
            for prompt, images, num_crops in zip(text, batched_images, batch_num_crops):
                image_indexes = [m.start() for m in re.finditer("<image>", prompt)]

                if len(images) != len(image_indexes):
                    raise ValueError(
                        f"Prompt contained {len(image_indexes)} image tokens but received {len(images)} images."
                    )

                # Insert additional image tokens for Pan-and-Scan crops
                for num, idx in reversed(list(zip(num_crops, image_indexes))):
                    if num:
                        formatted_image_text = (
                            "Here is the original image <image> and here are some crops to help you see better "
                            + " ".join(["<image>"] * num)
                        )
                        prompt = (
                            prompt[:idx]
                            + formatted_image_text
                            + prompt[idx + len("<image>") :]
                        )

            # Expand placeholder image tokens to the full image token sequence
            text = [
                prompt.replace("<image>", self.full_image_sequence) for prompt in text
            ]

        text_input = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
        return BatchFeature(data={**text_input, **image_inputs})

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Gemma
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->PaliGemma
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


__all__ = ["Gemma3Processor"]


================================================
FILE: server/text_generation_server/models/custom_modeling/gemma3/utils.py
================================================
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Union


from transformers.image_utils import ImageInput, is_valid_image, is_pil_image


def is_valid_list_of_images(images: List):
    return images and all(is_valid_image(image) for image in images)


def make_nested_list_of_images(
    images: Union[List[ImageInput], ImageInput],
) -> ImageInput:
    """
    Ensure that the output is a nested list of images.
    Args:
        images (`Union[List[ImageInput], ImageInput]`):
            The input image.
    Returns:
        list: A list of list of images or a list of 4d array of images.
    """
    # If it's a list of batches, it's already in the right format
    if (
        isinstance(images, (list, tuple))
        and all(isinstance(images_i, (list, tuple)) for images_i in images)
        and all(is_valid_list_of_images(images_i) for images_i in images)
    ):
        return images

    # If it's a list of images, it's a single batch, so convert it to a list of lists
    if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
        if is_pil_image(images[0]) or images[0].ndim == 3:
            return [images]
        if images[0].ndim == 4:
            return [list(image) for image in images]

    # If it's a single image, convert it to a list of lists
    if is_valid_image(images):
        if is_pil_image(images) or images.ndim == 3:
            return [[images]]
        if images.ndim == 4:
            return [list(images)]

    raise ValueError(
        "Invalid input type. Must be a single image, a list of images, or a list of batches of images."
    )


================================================
FILE: server/text_generation_server/models/custom_modeling/idefics2.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Idefics2 model."""

from typing import List, Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn
import math

from transformers.activations import ACT2FN
from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
)
from text_generation_server.layers.attention import Seqlen
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
)
from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class Idefics2VisionEmbeddings(nn.Module):
    """
    This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable
    resolution.

    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
    which allows treating images in their native aspect ratio and without the need to resize them to the same
    fixed size. In particular, we start from the original pre-trained SigLIP model
    (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions.
    """

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )
        self.patch_embedding.bias = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.position_embedding", weights=weights
        )

    def forward(
        self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor
    ) -> torch.Tensor:
        batch_size, _, max_im_h, max_im_w = pixel_values.shape

        patch_embeds = self.patch_embedding(pixel_values)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_nb_patches_h, max_nb_patches_w = (
            max_im_h // self.patch_size,
            max_im_w // self.patch_size,
        )
        boundaries = torch.arange(
            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
        )
        position_ids = torch.full(
            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
        )

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            nb_patches_h = p_attn_mask[:, 0].sum()
            nb_patches_w = p_attn_mask[0].sum()

            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)

            bucket_coords_h = torch.bucketize(
                fractional_coords_h, boundaries, right=True
            )
            bucket_coords_w = torch.bucketize(
                fractional_coords_w, boundaries, right=True
            )

            pos_ids = (
                bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
            ).flatten()
            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)
        embeddings = embeddings + self.position_embedding(position_ids)
        return embeddings


class Idefics2VisionAttention(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_size = self.embed_dim // self.num_heads
        if self.head_size * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_size**-0.5
        self.dropout = config.attention_dropout

        self.num_heads = self.num_heads // weights.process_group.size()
        self.embed_dim = self.embed_dim // weights.process_group.size()

        self.qkv = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.out_proj = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
        )
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        batch_size, q_len, _ = hidden_states.size()

        qkv = self.qkv(hidden_states)
        query_states, key_states, value_states = qkv.split(
            [
                self.head_size * self.num_heads,
                self.head_size * self.num_heads,
                self.head_size * self.num_heads,
            ],
            dim=2,
        )

        query_states = query_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        key_states = key_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        value_states = value_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)

        k_v_seq_len = key_states.shape[-2]
        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
        )

        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output


class Idefics2VisionMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Idefics2EncoderLayer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Idefics2VisionAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.layer_norm1 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm1", eps=config.layer_norm_eps, weights=weights
        )
        self.layer_norm2 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm2", eps=config.layer_norm_eps, weights=weights
        )
        self.mlp = Idefics2VisionMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights
        )

    # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class Idefics2Encoder(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                Idefics2EncoderLayer(
                    prefix=f"{prefix}.layers.{i}", config=config, weights=weights
                )
                for i in range(config.num_hidden_layers)
            ]
        )

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
            )
        return hidden_states


class Idefics2VisionTransformer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embeddings = Idefics2VisionEmbeddings(
            prefix=f"{prefix}.embeddings", config=config, weights=weights
        )
        self.encoder = Idefics2Encoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )
        self.post_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.post_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )

    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
    ):
        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            patch_size = self.config.patch_size
            patch_attention_mask = torch.ones(
                (
                    batch_size,
                    pixel_values.size(2) // patch_size,
                    pixel_values.size(3) // patch_size,
                )
            )
            patch_attention_mask = patch_attention_mask.to(
                dtype=torch.bool, device=pixel_values.device
            )

        hidden_states = self.embeddings(
            pixel_values=pixel_values, patch_attention_mask=patch_attention_mask
        )

        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
        # The call to `_upad_input` in `_flash_attention_forward` is expensive
        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
        if not torch.any(~patch_attention_mask):
            patch_attention_mask = None
        else:
            patch_attention_mask = _prepare_4d_attention_mask(
                patch_attention_mask, hidden_states.dtype
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=patch_attention_mask,
        )

        last_hidden_state = encoder_outputs
        last_hidden_state = self.post_layernorm(last_hidden_state)

        return last_hidden_state


class Idefics2MLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        act = config.text_config.hidden_act
        self.act = (
            ACT2FN[act]
            if "gelu" not in act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
                ),
            )
        )
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )

    def forward(self, hidden_states):
        start_shape = hidden_states.shape[:-1]
        gate_up_states = self.gate_up_proj(hidden_states)
        intermediate_size = gate_up_states.shape[-1] // 2
        gate_up_states = gate_up_states.view(-1, 2, intermediate_size)
        return self.down_proj(
            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]
        ).view(*start_shape, -1)


class Idefics2RMSNorm(nn.Module):
    def __init__(self, prefix, weights, eps):
        """
        Idefics2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.weight"), requires_grad=False
        )
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


class Idefics2PerceiverAttention(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.layer_idx = None
        self.hidden_size = config.text_config.hidden_size
        self.num_heads = config.perceiver_config.resampler_n_heads
        self.head_size = config.perceiver_config.resampler_head_dim
        self.num_key_value_heads = config.perceiver_config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.attention_dropout = config.perceiver_config.attention_dropout
        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            self.num_key_value_heads // weights.process_group.size()
        )

        self.q_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.q_proj",
            weights=weights,
            bias=False,
        )
        self.kv = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.o_proj", weights=weights, bias=False
        )

        self.is_causal = False

    def forward(
        self,
        latents: torch.Tensor,
        context: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = latents.size()
        kv_seq_len = q_len + context.size()[1]

        hidden_states = torch.concat([context, latents], dim=-2)
        query_states = self.q_proj(latents)
        kv = self.kv(hidden_states)
        key_states, value_states = kv.split(
            [
                self.head_size * self.num_key_value_heads,
                self.head_size * self.num_key_value_heads,
            ],
            dim=2,
        )

        query_states = query_states.view(
            bsz, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        key_states = key_states.view(
            bsz, kv_seq_len, self.num_key_value_heads, self.head_size
        ).transpose(1, 2)
        value_states = value_states.view(
            bsz, kv_seq_len, self.num_key_value_heads, self.head_size
        ).transpose(1, 2)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_size)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )

            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_size)

        attn_output = self.o_proj(attn_output)

        return attn_output


class Idefics2PerceiverLayer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.hidden_size = config.text_config.hidden_size
        self.n_latents = config.perceiver_config.resampler_n_latents
        self.depth = config.perceiver_config.resampler_depth
        self.rms_norm_eps = config.text_config.rms_norm_eps

        self.input_latents_norm = Idefics2RMSNorm(
            prefix=f"{prefix}.input_latents_norm",
            weights=weights,
            eps=self.rms_norm_eps,
        )
        self.input_context_norm = Idefics2RMSNorm(
            prefix=f"{prefix}.input_context_norm",
            weights=weights,
            eps=self.rms_norm_eps,
        )
        self.self_attn = Idefics2PerceiverAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.post_attention_layernorm = Idefics2RMSNorm(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=self.rms_norm_eps,
        )
        self.mlp = Idefics2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

    def forward(
        self,
        latents: torch.Tensor,
        context: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        """
        Args:
            latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
        """
        residual = latents

        latents = self.input_latents_norm(latents)
        context = self.input_context_norm(context)

        latents = self.self_attn(
            latents=latents,
            context=context,
            attention_mask=attention_mask,
        )
        latents = residual + latents
        residual = latents

        latents = self.post_attention_layernorm(latents)
        latents = self.mlp(latents)
        latents = residual + latents

        return latents


class Idefics2PerceiverResampler(nn.Module):
    def __init__(self, prefix, config, weights) -> None:
        super().__init__()
        self.hidden_size = config.text_config.hidden_size
        self.hidden_act = config.perceiver_config.hidden_act
        self.n_latents = config.perceiver_config.resampler_n_latents
        self.depth = config.perceiver_config.resampler_depth
        self.rms_norm_eps = config.text_config.rms_norm_eps

        # Create Latents for Perceiver
        self.latents = weights.get_tensor(f"{prefix}.latents")

        # Create Transformer Blocks
        self.layers = nn.ModuleList(
            [
                Idefics2PerceiverLayer(
                    prefix=f"{prefix}.layers.{idx}", config=config, weights=weights
                )
                for idx in range(self.depth)
            ]
        )
        self.norm = Idefics2RMSNorm(
            prefix=f"{prefix}.norm",
            weights=weights,
            eps=config.text_config.rms_norm_eps,
        )

    def forward(
        self,
        context: torch.Tensor,
        attention_mask,
    ) -> torch.Tensor:
        # seq embed -> bsz seq embed
        latents = self.latents.unsqueeze(0).expand(
            (context.shape[0], *self.latents.size())
        )

        latent_attention_mask = torch.ones(
            (attention_mask.size(0), latents.size(1)),
            dtype=attention_mask.dtype,
            device=attention_mask.device,
        )
        attention_mask = torch.cat([attention_mask, latent_attention_mask], dim=-1)
        attention_mask = _prepare_4d_attention_mask(
            attention_mask, latents.dtype, tgt_len=self.n_latents
        )

        compressed_context = latents
        for perceiver_layer in self.layers:
            compressed_context = perceiver_layer(
                compressed_context,
                context,
                attention_mask=attention_mask,
            )
        compressed_context = self.norm(compressed_context)

        return compressed_context


class Idefics2Connector(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.modality_projection = Idefics2MLP(
            prefix=f"{prefix}.modality_projection", config=config, weights=weights
        )
        self.perceiver_resampler = Idefics2PerceiverResampler(
            prefix=f"{prefix}.perceiver_resampler", config=config, weights=weights
        )

    def forward(self, image_hidden_states, attention_mask):
        image_hidden_states = self.modality_projection(image_hidden_states)
        image_hidden_states = self.perceiver_resampler(
            context=image_hidden_states, attention_mask=attention_mask
        )
        return image_hidden_states


class Idefics2ForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator

        vision_config = config.vision_config
        self.text_model = load_text_model(
            prefix="model" if not prefix else f"{prefix}.model",
            config=config.text_config,
            weights=weights,
            name="text_model",
        )
        self.dtype = weights.dtype

        # The vision and connector models are not quantized.
        with weights.use_loader(DefaultWeightsLoader(UnquantizedWeight)):
            self.vision_model = Idefics2VisionTransformer(
                prefix=(
                    f"{prefix}.model.vision_model" if prefix else "model.vision_model"
                ),
                config=vision_config,
                weights=weights,
            )

            config.quantize = None
            self.connector = Idefics2Connector(
                prefix=f"{prefix}.model.connector" if prefix else "model.connector",
                config=config,
                weights=weights,
            )

        self.config = config
        self.image_seq_len = config.perceiver_config.resampler_n_latents
        self.image_token_id = config.image_token_id
        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )

    def _merge_input_ids_with_image_features(
        self,
        input_ids: torch.Tensor,
        inputs_embeds: torch.Tensor,
        image_features: torch.Tensor,
    ):
        """In place merges in vision_embeddings with inputs_embeds."""
        # mask = input_ids == self.config.image_token_index
        mask = input_ids == self.config.image_token_id
        # Let's pray we have enabled enough slots !
        inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
        return inputs_embeds

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        assert pixel_values is not None
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        all_states = []
        all_pixel_values = pixel_values
        all_pixel_mask = pixel_attention_mask
        for i in range(batch_size):
            pixel_values = all_pixel_values.to(dtype=self.dtype)  # fp16 compatibility
            pixel_values = pixel_values[i : i + 1]
            pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:])

            # Remove padding images - padding images are full 0.
            nb_values_per_image = pixel_values.shape[1:].numel()
            real_images_inds = (pixel_values == 0.0).sum(
                dim=(-1, -2, -3)
            ) != nb_values_per_image
            pixel_values = pixel_values[real_images_inds].contiguous()

            # Handle the vision attention mask
            if pixel_attention_mask is None:
                pixel_attention_mask = torch.ones(
                    size=(
                        pixel_values.size(0),
                        pixel_values.size(2),
                        pixel_values.size(3),
                    ),
                    dtype=torch.bool,
                    device=pixel_values.device,
                )
            else:
                # Remove padding images from the mask/pP p
                pixel_attention_mask = all_pixel_mask[i : i + 1]
                pixel_attention_mask = pixel_attention_mask.view(
                    1 * num_images, *pixel_attention_mask.shape[2:]
                )
                pixel_attention_mask = pixel_attention_mask[
                    real_images_inds
                ].contiguous()

            patch_size = self.config.vision_config.patch_size
            patches_subgrid = pixel_attention_mask.unfold(
                dimension=1, size=patch_size, step=patch_size
            )
            patches_subgrid = patches_subgrid.unfold(
                dimension=2, size=patch_size, step=patch_size
            )
            patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

            # Get sequence from the vision encoder
            image_hidden_states = self.vision_model(
                pixel_values=pixel_values,
                patch_attention_mask=patch_attention_mask,
            )

            # Modality projection & resampling
            image_hidden_states = self.connector(
                image_hidden_states,
                attention_mask=patch_attention_mask.view(pixel_values.size(0), -1),
            )
            all_states.append(image_hidden_states)
        image_hidden_states = torch.stack(all_states, dim=0)
        return image_hidden_states.view(-1, image_hidden_states.shape[-1])

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            inputs_embeds = self._merge_input_ids_with_image_features(
                input_ids, inputs_embeds, vision_embeds
            )
        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        # Unused here
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ):
        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
            true_max_s=max_s,
            prefill_cache_indices=None,
            adapter_data=adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/idefics3.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Idefics3 model."""

from typing import List, Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
)
from text_generation_server.layers.attention import Seqlen
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
)
from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class Idefics3VisionEmbeddings(nn.Module):
    """
    This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings` to enable images of variable
    resolution.

    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
    which allows treating images in their native aspect ratio and without the need to resize them to the same
    fixed size. In particular, we start from the original pre-trained SigLIP model
    (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions.
    """

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )
        self.patch_embedding.bias = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.position_embedding", weights=weights
        )

    def forward(
        self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor
    ) -> torch.Tensor:
        batch_size, _, max_im_h, max_im_w = pixel_values.shape

        patch_embeds = self.patch_embedding(pixel_values)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_nb_patches_h, max_nb_patches_w = (
            max_im_h // self.patch_size,
            max_im_w // self.patch_size,
        )
        boundaries = torch.arange(
            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
        )
        position_ids = torch.full(
            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
        )

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            nb_patches_h = p_attn_mask[:, 0].sum()
            nb_patches_w = p_attn_mask[0].sum()

            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)

            bucket_coords_h = torch.bucketize(
                fractional_coords_h, boundaries, right=True
            )
            bucket_coords_w = torch.bucketize(
                fractional_coords_w, boundaries, right=True
            )

            pos_ids = (
                bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
            ).flatten()
            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)
        embeddings = embeddings + self.position_embedding(position_ids)
        return embeddings


class Idefics3VisionAttention(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_size = self.embed_dim // self.num_heads
        if self.head_size * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_size**-0.5
        self.dropout = config.attention_dropout

        self.num_heads = self.num_heads // weights.process_group.size()
        self.embed_dim = self.embed_dim // weights.process_group.size()

        self.qkv = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=True,
        )
        self.out_proj = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
        )
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        batch_size, q_len, _ = hidden_states.size()

        qkv = self.qkv(hidden_states)
        query_states, key_states, value_states = qkv.split(
            [
                self.head_size * self.num_heads,
                self.head_size * self.num_heads,
                self.head_size * self.num_heads,
            ],
            dim=2,
        )

        query_states = query_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        key_states = key_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)
        value_states = value_states.view(
            batch_size, q_len, self.num_heads, self.head_size
        ).transpose(1, 2)

        k_v_seq_len = key_states.shape[-2]
        attn_weights = (
            torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
        )

        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output


class Idefics3VisionMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Idefics3EncoderLayer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Idefics3VisionAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.layer_norm1 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm1", eps=config.layer_norm_eps, weights=weights
        )
        self.layer_norm2 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm2", eps=config.layer_norm_eps, weights=weights
        )
        self.mlp = Idefics3VisionMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights
        )

    # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class Idefics3Encoder(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                Idefics3EncoderLayer(
                    prefix=f"{prefix}.layers.{i}", config=config, weights=weights
                )
                for i in range(config.num_hidden_layers)
            ]
        )

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
            )
        return hidden_states


class Idefics3VisionTransformer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embeddings = Idefics3VisionEmbeddings(
            prefix=f"{prefix}.embeddings", config=config, weights=weights
        )
        self.encoder = Idefics3Encoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )
        self.post_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.post_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )

    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
    ):
        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            patch_size = self.config.patch_size
            patch_attention_mask = torch.ones(
                (
                    batch_size,
                    pixel_values.size(2) // patch_size,
                    pixel_values.size(3) // patch_size,
                )
            )
            patch_attention_mask = patch_attention_mask.to(
                dtype=torch.bool, device=pixel_values.device
            )

        hidden_states = self.embeddings(
            pixel_values=pixel_values, patch_attention_mask=patch_attention_mask
        )

        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
        # The call to `_upad_input` in `_flash_attention_forward` is expensive
        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
        if not torch.any(~patch_attention_mask):
            patch_attention_mask = None
        else:
            patch_attention_mask = _prepare_4d_attention_mask(
                patch_attention_mask, hidden_states.dtype
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=patch_attention_mask,
        )

        last_hidden_state = encoder_outputs
        last_hidden_state = self.post_layernorm(last_hidden_state)

        return last_hidden_state


class Idefics3SimpleMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        input_size = config.vision_config.hidden_size * (config.scale_factor**2)
        output_size = config.text_config.hidden_size
        proj = nn.Parameter(
            weights.get_tensor(f"{prefix}.modality_projection.proj.weight"),
            requires_grad=False,
        ).to(weights.dtype)
        self.proj = nn.Linear(input_size, output_size, bias=False)
        self.proj.weight = proj

    def forward(self, x):
        return self.proj(x)


class Idefics3Connector(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.modality_projection = Idefics3SimpleMLP(prefix, config, weights)
        self.scale_factor = config.scale_factor

    def pixel_shuffle(self, x, scale_factor=2):
        bsz, seq, embed_dim = x.size()
        height = width = int(seq**0.5)
        x = x.view(bsz, height, width, embed_dim)
        x = x.view(bsz, height, int(width / scale_factor), embed_dim * scale_factor)
        x = x.permute(0, 2, 1, 3)
        x = x.reshape(
            bsz,
            int(width / scale_factor),
            int(height / scale_factor),
            embed_dim * (scale_factor**2),
        )
        x = x.permute(0, 2, 1, 3)
        x = x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2))
        return x

    def forward(self, image_hidden_states):
        image_hidden_states = self.pixel_shuffle(image_hidden_states, self.scale_factor)
        image_hidden_states = self.modality_projection(image_hidden_states)
        return image_hidden_states


class Idefics3ForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator
        # set tie_word_embeddings to True to load `.embed_tokens.weight` instead of `.lm_head.weight`
        # since Idefics3 uses the `embed_tokens` for the final prediction
        # config.text_config.tie_word_embeddings = True

        vision_config = config.vision_config
        self.text_model = load_text_model(
            prefix="model" if not prefix else f"{prefix}.model",
            config=config.text_config,
            weights=weights,
            name="text_model",
        )
        self.dtype = weights.dtype

        # The vision and connector models are not quantized.
        with weights.use_loader(DefaultWeightsLoader(UnquantizedWeight)):
            self.vision_model = Idefics3VisionTransformer(
                prefix=(
                    f"{prefix}.model.vision_model" if prefix else "model.vision_model"
                ),
                config=vision_config,
                weights=weights,
            )

            config.quantize = None
            self.connector = Idefics3Connector(
                prefix=f"{prefix}.model.connector" if prefix else "model.connector",
                config=config,
                weights=weights,
            )

        self.config = config
        self.image_token_id = config.image_token_id
        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )

    def _merge_input_ids_with_image_features(
        self,
        input_ids: torch.Tensor,
        inputs_embeds: torch.Tensor,
        image_features: torch.Tensor,
    ):
        """In place merges in vision_embeddings with inputs_embeds."""
        # mask = input_ids == self.config.image_token_index
        mask = input_ids == self.config.image_token_id
        # Let's pray we have enabled enough slots !
        inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
        return inputs_embeds

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        all_states = []
        all_pixel_values = pixel_values
        all_pixel_mask = pixel_attention_mask
        for i in range(batch_size):
            pixel_values = all_pixel_values.to(dtype=self.dtype)  # fp16 compatibility
            pixel_values = pixel_values[i : i + 1]
            pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:])

            # Remove padding images - padding images are full 0.
            nb_values_per_image = pixel_values.shape[1:].numel()
            real_images_inds = (pixel_values == 0.0).sum(
                dim=(-1, -2, -3)
            ) != nb_values_per_image
            pixel_values = pixel_values[real_images_inds].contiguous()
            # Handle the vision attention mask
            if pixel_attention_mask is None:
                pixel_attention_mask = torch.ones(
                    size=(
                        pixel_values.size(0),
                        pixel_values.size(2),
                        pixel_values.size(3),
                    ),
                    dtype=torch.bool,
                    device=pixel_values.device,
                )
            else:
                # Remove padding images from the mask/pP p
                pixel_attention_mask = all_pixel_mask[i : i + 1]
                pixel_attention_mask = pixel_attention_mask.view(
                    1 * num_images, *pixel_attention_mask.shape[2:]
                )
                pixel_attention_mask = pixel_attention_mask[
                    real_images_inds
                ].contiguous()

            patch_size = self.config.vision_config.patch_size
            patches_subgrid = pixel_attention_mask.unfold(
                dimension=1, size=patch_size, step=patch_size
            )
            patches_subgrid = patches_subgrid.unfold(
                dimension=2, size=patch_size, step=patch_size
            )
            patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

            # Get sequence from the vision encoder
            image_hidden_states = self.vision_model(
                pixel_values=pixel_values,
                patch_attention_mask=patch_attention_mask,
            )

            # Modality projection & resampling
            image_hidden_states = self.connector(
                image_hidden_states,
            )

            all_states.append(image_hidden_states)
        image_hidden_states = torch.stack(all_states, dim=0)

        return image_hidden_states.view(-1, image_hidden_states.shape[-1])

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            inputs_embeds = self._merge_input_ids_with_image_features(
                input_ids, inputs_embeds, vision_embeds
            )
        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        # Unused here
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
        image_indices=None,
    ):
        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
            true_max_s=max_s,
            prefill_cache_indices=None,
            adapter_data=adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/idefics_config.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Idefics model configuration"""
import copy

from transformers import PretrainedConfig

IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "HuggingFaceM4/idefics-9b": "https://huggingface.co/HuggingFaceM4/idefics-9b/blob/main/config.json",
    "HuggingFaceM4/idefics-80b": "https://huggingface.co/HuggingFaceM4/idefics-80b/blob/main/config.json",
}


class IdeficsVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.
    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        image_num_channels (`int`, *optional*, defaults to `3`):
            Number of image channels.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
            testing).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    """

    model_type = "idefics"
    attribute_map = {
        "hidden_size": "embed_dim",
    }

    def __init__(
        self,
        embed_dim=768,
        image_size=224,
        intermediate_size=5120,
        patch_size=14,
        num_hidden_layers=32,
        num_attention_heads=16,
        num_channels=3,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        self.embed_dim = embed_dim
        self.image_size = image_size
        self.intermediate_size = intermediate_size
        self.patch_size = patch_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act

        super().__init__(**kwargs)


class IdeficsPerceiverConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.
    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        use_resampler (`bool`, *optional*, defaults to `False`):
            Whether or not to use the resampler
        resampler_n_latents (`int`, *optional*, defaults to ):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 6):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
            Whether or not to use qk layer norms in perceiver
    """

    model_type = "idefics"

    def __init__(
        self,
        use_resampler=False,
        resampler_n_latents=64,
        resampler_depth=6,
        resampler_n_heads=16,
        resampler_head_dim=96,
        qk_layer_norms_perceiver=False,
        **kwargs,
    ):
        self.use_resampler = use_resampler
        self.resampler_n_latents = resampler_n_latents
        self.resampler_depth = resampler_depth
        self.resampler_n_heads = resampler_n_heads
        self.resampler_head_dim = resampler_head_dim
        self.qk_layer_norms_perceiver = qk_layer_norms_perceiver

        super().__init__(**kwargs)


class IdeficsConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.
    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        additional_vocab_size (`int`, *optional`, defaults to 0):
            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
            are always trainable whereas regular vocab tokens can be frozen or not.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Idefics model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~IdeficsModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
            Initialization type for the alphas.
        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
            Attention.
        alpha_type (`str`, *optional*, defaults to `"float"`):
            Whether the gating alphas should be vectors or single floats.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0)
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1)
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2)
            End of stream token id.
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        cross_layer_interval (`int`, *optional*, default to 1)
            Interval for cross attention (from text to image) layers.
        qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k
        freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers
        freeze_text_module_exceptions (`bool`, *optional*, defaults to `[]`):
            Exceptions to freezing text layers when `freeze_text_layers` is `True`
        freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze lm head
        freeze_vision_layers (`bool`, *optional*, defaults to `True`):  Whether to freeze vision layers
        freeze_vision_module_exceptions (`bool`, *optional*, defaults to `[]`):
            Exceptions to freezing vision layers when `freeze_vision_layers` is `True`
        use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler
        vision_config (`IdeficsVisionConfig`,  *optional*): Custom vision config or dict
        perceiver_config (`IdeficsPerceiverConfig`,  *optional*): Custom perceiver config or dict
    Example:
    ```python
    >>> from transformers import IdeficsModel, IdeficsConfig
    >>> # Initializing a Idefics idefics-9b style configuration
    >>> configuration = IdeficsConfig()
    >>> # Initializing a model from the idefics-9b style configuration
    >>> model = IdeficsModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "idefics"
    is_composition = True

    def __init__(
        self,
        vocab_size=32000,
        additional_vocab_size=0,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        dropout=0.0,
        hidden_act="silu",
        initializer_range=0.02,
        alpha_initializer="zeros",
        alphas_initializer_range=0.0,
        alpha_type="float",
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        cross_layer_interval=1,
        qk_layer_norms=False,
        freeze_text_layers=True,
        freeze_text_module_exceptions=[],
        freeze_lm_head=False,
        freeze_vision_layers=True,
        freeze_vision_module_exceptions=[],
        use_resampler=False,
        vision_config=None,
        perceiver_config=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.additional_vocab_size = additional_vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.dropout = dropout
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.alpha_initializer = alpha_initializer
        self.alphas_initializer_range = alphas_initializer_range
        self.alpha_type = alpha_type
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        self.cross_layer_interval = cross_layer_interval
        self.qk_layer_norms = qk_layer_norms
        self.freeze_vision_layers = freeze_vision_layers

        self.freeze_text_layers = freeze_text_layers
        self.freeze_text_module_exceptions = freeze_text_module_exceptions
        self.freeze_vision_module_exceptions = freeze_vision_module_exceptions
        self.freeze_lm_head = freeze_lm_head

        self.use_resampler = use_resampler

        if perceiver_config is None:
            self.perceiver_config = IdeficsPerceiverConfig()
        elif isinstance(perceiver_config, dict):
            self.perceiver_config = IdeficsPerceiverConfig(**perceiver_config)
        elif isinstance(perceiver_config, IdeficsPerceiverConfig):
            self.perceiver_config = perceiver_config

        if vision_config is None:
            self.vision_config = IdeficsVisionConfig()
        elif isinstance(vision_config, dict):
            self.vision_config = IdeficsVisionConfig(**vision_config)
        elif isinstance(vision_config, IdeficsVisionConfig):
            self.vision_config = vision_config

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        # IMPORTANT: Do not do any __init__ args-based checks in the constructor, since
        # PretrainedConfig.from_dict first instantiates the class with the config dict and only then
        # updates the config object with `kwargs` from from_pretrained, so during the instantiation
        # of this object many attributes have default values and haven't yet been overridden.
        # Do any required checks inside `from_pretrained` once the superclass' `from_pretrained` was run.

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        output = copy.deepcopy(self.__dict__)

        output["vision_config"] = self.vision_config.to_dict()
        output["perceiver_config"] = self.perceiver_config.to_dict()
        output["model_type"] = self.__class__.model_type

        return output


================================================
FILE: server/text_generation_server/models/custom_modeling/idefics_image_processing.py
================================================
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Idefics."""

from typing import Callable, Dict, List, Optional, Union, Iterable
import numpy as np

from PIL import Image

import transformers
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import (
    resize,
    to_channel_dimension_format,
    rescale,
    normalize,
)
from transformers.image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)
from io import BytesIO
import base64
import requests
from transformers import TensorType, is_torch_available


IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]


def convert_to_rgb(image):
    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
    # for transparent images. The call to `alpha_composite` handles this case
    if image.mode == "RGB":
        return image

    image_rgba = image.convert("RGBA")
    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
    alpha_composite = Image.alpha_composite(background, image_rgba)
    alpha_composite = alpha_composite.convert("RGB")
    return alpha_composite


class IdeficsImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Idefics image processor.
    Args:
        image_size (`int`, *optional*, defaults to `224`):
            Resize to image size
        image_num_channels (`int`, *optional*, defaults to `3`):
            Number of image channels.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        image_size: int = 224,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        image_num_channels: Optional[int] = 3,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.image_size = image_size
        self.image_num_channels = image_num_channels
        self.image_mean = image_mean
        self.image_std = image_std

    def preprocess(
        self,
        images: ImageInput,
        image_num_channels: Optional[int] = 3,
        image_size: Optional[Dict[str, int]] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        transform: Callable = None,
        **kwargs,
    ) -> TensorType.PYTORCH:
        """
        Preprocess a batch of images.
        Args:
            images (`ImageInput`):
                A list of images to preprocess.
            image_size (`int`, *optional*, defaults to `self.image_size`):
                Resize to image size
            image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`):
                Number of image channels.
            image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
                Mean to use if normalizing the image. This is a float or list of floats the length of the number of
                channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can
                be overridden by the `image_mean` parameter in the `preprocess` method.
            image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
                Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
                number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
                method. Can be overridden by the `image_std` parameter in the `preprocess` method.
            transform (`Callable`, *optional*, defaults to `None`):
                A custom transform function that accepts a single image can be passed for training. For example,
                `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
                assumed - and then a preset of inference-specific transforms will be applied to the images
        Returns:
            a PyTorch tensor of the processed images
        """
        image_size = image_size if image_size is not None else self.image_size
        image_num_channels = (
            image_num_channels
            if image_num_channels is not None
            else self.image_num_channels
        )
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        size = (image_size, image_size)

        if len(images) == 0:
            return []

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        # For training a user needs to pass their own set of transforms as a Callable.
        # For reference this is what was used in the original IDEFICS training:
        # transform = transforms.Compose([
        #     convert_to_rgb,
        #     transforms.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
        #     transforms.ToTensor(),
        #     transforms.Normalize(mean=image_mean, std=image_std),
        # ])
        if transform is not None:
            if not is_torch_available():
                raise ImportError("To pass in `transform` torch must be installed")
            import torch

            images = [transform(x) for x in images]
            return torch.stack(images)

        # for inference we do the exact transforms that were used to train IDEFICS
        images = [convert_to_rgb(x) for x in images]
        # further transforms expect numpy arrays
        images = [to_numpy_array(x) for x in images]
        images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
        images = [self.rescale(image=image, scale=1 / 255) for image in images]
        images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
        images = [
            to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images
        ]
        # TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
        images = BatchFeature(
            data={"pixel_values": images}, tensor_type=TensorType.PYTORCH
        )["pixel_values"]

        return images

    def fetch_images(self, image_url_or_urls: Union[str, List[str]]):
        """
        Convert a single or a list of urls into the corresponding `PIL.Image` objects.
        If a single url is passed, the return value will be a single object. If a list is passed a list of objects is
        returned.
        """
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0"
                " Safari/537.36"
            )
        }
        if isinstance(image_url_or_urls, list):
            return [self.fetch_images(x) for x in image_url_or_urls]
        elif isinstance(image_url_or_urls, str):
            image = image_url_or_urls

            if image.startswith("http://") or image.startswith("https://"):
                response = requests.get(
                    image_url_or_urls, stream=True, headers=headers, timeout=(1, 5)
                )
                response.raise_for_status()
                content = response.content
            elif image.startswith("data:"):
                # https://stackoverflow.com/questions/17090571/is-there-a-way-to-set-background-image-as-a-base64-encoded-image
                # data:image/png;base64,xxx
                image = image.split(",")[-1]
                content = base64.b64decode(image)
            else:
                raise ValueError(f"Unrecognized image {image}")

            try:
                image = Image.open(BytesIO(content))
                # image.verify()
            except Exception:
                raise ValueError(f"Could not load image from url {image_url_or_urls}")
            return image
        else:
            raise ValueError(
                f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}"
            )

    def rescale(
        self,
        image: np.ndarray,
        scale: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Rescale an image by a scale factor. image = image * scale.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            scale (`float`):
                The scaling factor to rescale pixel values by.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The rescaled image.
        """
        # return rescale(image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs)
        # requires 4.32
        return rescale(image, scale=scale, data_format=data_format, **kwargs)

    def normalize(
        self,
        image: np.ndarray,
        mean: Union[float, Iterable[float]],
        std: Union[float, Iterable[float]],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Normalize an image. image = (image - image_mean) / image_std.

        Args:
            image (`np.ndarray`):
                Image to normalize.
            mean (`float` or `Iterable[float]`):
                Image mean to use for normalization.
            std (`float` or `Iterable[float]`):
                Image standard deviation to use for normalization.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The normalized image.
        """
        # TODO 4.32
        return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)


transformers.IdeficsImageProcessor = IdeficsImageProcessor


================================================
FILE: server/text_generation_server/models/custom_modeling/idefics_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Idefics model."""
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from transformers import PreTrainedModel
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    dataclass,
)
from text_generation_server.models.custom_modeling.idefics_config import IdeficsConfig
from text_generation_server.models.custom_modeling.idefics_vision import (
    IdeficsVisionTransformer,
)
from text_generation_server.models.custom_modeling.idefics_perceiver import (
    IdeficsPerceiverResampler,
)
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    SpeculativeHead,
    FastLinear,
)
from text_generation_server.layers.rotary import PositionRotaryEmbedding
from text_generation_server.utils.import_utils import SYSTEM
from loguru import logger

if SYSTEM == "cuda":
    import dropout_layer_norm
elif SYSTEM == "rocm":
    import vllm._custom_ops as ops
else:
    dropout_layer_norm = None


@dataclass
class BaseModelOutputWithPastImage(BaseModelOutputWithPast):
    image_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
class CausalLMOutputWithPastImage(CausalLMOutputWithPast):
    image_hidden_states: Optional[torch.FloatTensor] = None


# logger = logging.get_logger(__name__)

# _CONFIG_FOR_DOC = "IdeficsConfig"

# IDEFICS_PRETRAINED_MODEL_ARCHIVE_LIST = [
#     "HuggingFaceM4/idefics-9b",
#     "HuggingFaceM4/idefics-80b",
#     # See all Idefics models at https://huggingface.co/models?filter=idefics
# ]


def expand_inputs_for_generation(
    input_ids,
    expand_size=1,
    is_encoder_decoder=False,
    attention_mask=None,
    encoder_outputs=None,
    **model_kwargs,
):
    expanded_return_idx = (
        torch.arange(input_ids.shape[0])
        .view(-1, 1)
        .repeat(1, expand_size)
        .view(-1)
        .to(input_ids.device)
    )
    input_ids = input_ids.index_select(0, expanded_return_idx)

    if "token_type_ids" in model_kwargs:
        token_type_ids = model_kwargs["token_type_ids"]
        model_kwargs["token_type_ids"] = token_type_ids.index_select(
            0, expanded_return_idx
        )

    if attention_mask is not None:
        model_kwargs["attention_mask"] = attention_mask.index_select(
            0, expanded_return_idx
        )
        model_kwargs["image_attention_mask"] = model_kwargs[
            "image_attention_mask"
        ].index_select(0, expanded_return_idx)
        model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(
            0, expanded_return_idx
        )

    if is_encoder_decoder:
        if encoder_outputs is None:
            raise ValueError(
                "If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined."
            )
        encoder_outputs["last_hidden_state"] = (
            encoder_outputs.last_hidden_state.index_select(
                0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device)
            )
        )
        model_kwargs["encoder_outputs"] = encoder_outputs
    return input_ids, model_kwargs


def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
    # must have this key set to at least None
    model_kwargs["past_key_values"] = model_kwargs.get("past_key_values", None)

    # update past
    if "past_key_values" in outputs:
        model_kwargs["past"] = outputs.past_key_values
    elif "mems" in outputs:
        model_kwargs["past"] = outputs.mems
    elif "past_buckets_states" in outputs:
        model_kwargs["past"] = outputs.past_buckets_states
    else:
        model_kwargs["past"] = None

    # update token_type_ids with last value
    if "token_type_ids" in model_kwargs:
        token_type_ids = model_kwargs["token_type_ids"]
        model_kwargs["token_type_ids"] = torch.cat(
            [token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1
        )

    # update attention masks
    if not is_encoder_decoder:
        if "attention_mask" in model_kwargs:
            attention_mask = model_kwargs["attention_mask"]
            model_kwargs["attention_mask"] = torch.cat(
                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))],
                dim=-1,
            )
        if "image_attention_mask" in model_kwargs:
            image_attention_mask = model_kwargs["image_attention_mask"]
            last_mask = image_attention_mask[:, -1, :].unsqueeze(1)
            model_kwargs["image_attention_mask"] = last_mask

    return model_kwargs


def prepare_inputs_for_generation(input_ids, past=None, **kwargs):
    token_type_ids = kwargs.get("token_type_ids", None)
    # only last token for inputs_ids if past is defined in kwargs
    if past:
        input_ids = input_ids[:, -1].unsqueeze(-1)
        if token_type_ids is not None:
            token_type_ids = token_type_ids[:, -1].unsqueeze(-1)

    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past:
            position_ids = position_ids[:, -1].unsqueeze(-1)

    pixel_values = kwargs.get("pixel_values", None)
    image_attention_mask = kwargs.get("image_attention_mask", None)
    # if pixel_values is None or image_attention_mask is None:
    #     raise ValueError("pixel values and image attention mask cannot be None")

    return {
        "input_ids": input_ids,
        "past_key_values": past,
        "use_cache": kwargs.get("use_cache"),
        "position_ids": position_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "pixel_values": pixel_values,
        "image_attention_mask": image_attention_mask,
    }


def freeze_model(model, module_exceptions=[]):
    mapping = {
        "LayerNorm": nn.LayerNorm,
        "Linear": nn.Linear,
        "Embedding": nn.Embedding,
    }
    module_exceptions_mapped = [mapping[m] for m in module_exceptions]
    for module in model.modules():
        if module_exceptions and any(
            [isinstance(module, t) for t in module_exceptions_mapped]
        ):
            module.requires_grad_(
                True
            )  # Explicitely setting it to true to avoid any mistakes
        else:
            module.requires_grad_(False)
    return model


class IdeficsDecoupledPartialTPEmbedding(nn.Module):
    def __init__(
        self,
        config,
        weights,
    ):
        super().__init__()
        self.num_embeddings = config.vocab_size
        self.weight = TensorParallelEmbedding(
            prefix="model.embed_tokens", weights=weights
        )
        self.additional_weight = nn.Parameter(
            weights.get_tensor("model.embed_tokens.additional_embedding.weight")
        )

    def forward(self, input_ids):
        # Clone so that we don't modify the original input_ids later on
        input_ids = input_ids.clone()
        additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
        input_ids_additional_vocab = input_ids[additional_vocab_indices]
        additional_embeddings = torch.nn.functional.embedding(
            input_ids_additional_vocab - self.num_embeddings, self.additional_weight
        )

        # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
        input_ids[additional_vocab_indices] = 0
        full_vector = self.weight(input_ids)

        # overwrite the records with high indices
        full_vector[additional_vocab_indices] = additional_embeddings

        return full_vector


class IdeficsDecoupledTensorParallelLinear(nn.Module):
    # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear
    """
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    """

    def __init__(
        self,
        config,
        weights,
    ) -> None:
        super().__init__()
        self.fc = SpeculativeHead.load(config=config, prefix="lm_head", weights=weights)
        self.additional_fc = FastLinear.load(
            config=config,
            prefix="lm_head.additional_fc",
            weights=weights,
            bias=False,
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output, speculative_logits = self.fc(input)
        additional_features = self.additional_fc(input)
        output = torch.cat((output, additional_features), -1)

        return output, speculative_logits

    def extra_repr(self) -> str:
        """Overwriting `nn.Linear.extra_repr` to include new parameters."""
        return "in_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}".format(
            self.in_features,
            self.out_features,
            self.out_additional_features,
            self.bias is not None,
            self.partially_freeze,
        )


# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
    input_ids_shape: torch.Size,
    dtype: torch.dtype,
    device: torch.device,
    past_key_values_length: int = 0,
):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat(
            [
                torch.zeros(
                    tgt_len, past_key_values_length, dtype=dtype, device=device
                ),
                mask,
            ],
            dim=-1,
        )
    return mask[None, None, :, :].expand(
        bsz, 1, tgt_len, tgt_len + past_key_values_length
    )


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(
        inverted_mask.to(torch.bool), torch.finfo(dtype).min
    )


class IdeficsRMSNorm(nn.Module):
    def __init__(self, prefix, weights, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()

        weight = weights.get_tensor(f"{prefix}.weight")
        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps

    def forward(self, hidden_states, residual=None):
        if SYSTEM == "ipex":
            import intel_extension_for_pytorch as ipex

            out = ipex.llm.functional.add_rms_norm(
                residual,
                hidden_states,
                self.weight,
                None,
                self.variance_epsilon,
                residual is not None,
            )
            return out
        elif hidden_states.shape[-1] > 8192:
            if residual is not None:
                hidden_states += residual
            residual = hidden_states

            hidden_states = hidden_states.to(torch.float32)
            variance = hidden_states.pow(2).mean(-1, keepdim=True)
            hidden_states = hidden_states * torch.rsqrt(
                variance + self.variance_epsilon
            )

            # convert into half-precision if necessary
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                hidden_states = hidden_states.to(self.weight.dtype)

            return self.weight * hidden_states
        elif SYSTEM == "cuda":
            # faster post attention rms norm
            unwrap = False
            if len(hidden_states.shape) > 2:
                unwrap = True
                shape = hidden_states.shape
                hidden_states = hidden_states.reshape(-1, shape[-1])

            normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd(
                hidden_states,
                residual,
                self.weight,
                None,
                None,
                None,
                None,
                None,
                0.0,
                self.variance_epsilon,
                1.0,
                0,
                None,
                False,
                True,  # Activate RMSNorm
            )
            if res is None:
                res = hidden_states

            if unwrap:
                normed_hidden_states = normed_hidden_states.view(*shape)

            return normed_hidden_states
        elif SYSTEM == "rocm":
            # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
            if residual is not None:
                hidden_states += residual
            residual = hidden_states

            unwrap = False
            if len(hidden_states.shape) > 2:
                unwrap = True
                shape = hidden_states.shape
                hidden_states = hidden_states.reshape(-1, shape[-1])

            out = torch.empty_like(hidden_states)
            ops.rms_norm(
                out,
                hidden_states,
                self.weight.data,
                self.variance_epsilon,
            )

            if unwrap:
                out = out.view(*shape)

            return out
        else:
            raise ValueError(
                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
            )


# this was adapted from LlamaMLP
class IdeficsMLP(nn.Module):
    def __init__(
        self,
        config,
        prefix,
        weights,
    ):
        super().__init__()
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        gate_up_states = self.gate_up_proj(hidden_states)
        shape = gate_up_states.shape
        gate_up_states = gate_up_states.view(*shape[:-1], 2, shape[-1] // 2)
        return self.down_proj(
            self.act_fn(gate_up_states[:, :, 0]) * gate_up_states[:, :, 1]
        )


# this was adapted from LlamaAttention
class IdeficsAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config,
        prefix,
        weights,
        qk_layer_norms: bool = False,
        is_cross_attention: bool = False,
    ):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.dropout = config.dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.is_cross_attention = is_cross_attention

        # if not hasattr(nn.functional, "scaled_dot_product_attention"):
        #     raise ValueError("this model requires pytorch 2.0 or higher")

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads //= weights.process_group.size()

        if self.is_cross_attention:
            # kv_input_dim = (
            #     self.hidden_size if not hasattr(config.vision_config, "embed_dim") else config.vision_config.embed_dim
            # )
            self.q_proj = TensorParallelColumnLinear.load(
                config, prefix=f"{prefix}.q_proj", weights=weights, bias=False
            )
            self.k_proj = TensorParallelColumnLinear.load(
                config, prefix=f"{prefix}.k_proj", weights=weights, bias=False
            )
            self.v_proj = TensorParallelColumnLinear.load(
                config, prefix=f"{prefix}.v_proj", weights=weights, bias=False
            )
        else:
            self.qkv = TensorParallelColumnLinear.load_multi(
                config,
                prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
                dim=0,
                weights=weights,
                bias=False,
            )
        self.o_proj = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.o_proj", weights=weights, bias=False
        )
        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config, dim=self.head_dim, base=10000.0, device=weights.device
        )
        self.qk_layer_norms = qk_layer_norms
        if self.qk_layer_norms:
            self.q_layer_norm = IdeficsRMSNorm(
                prefix=f"{prefix}.q_layer_norm",
                weights=weights,
                eps=config.rms_norm_eps,
            )
            self.k_layer_norm = IdeficsRMSNorm(
                prefix=f"{prefix}.q_layer_norm",
                weights=weights,
                eps=config.rms_norm_eps,
            )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        # if key_value_states are provided this layer is used as a cross-attention layer
        is_cross_attention = self.is_cross_attention or key_value_states is not None

        bsz, q_len, _ = hidden_states.size()

        if is_cross_attention:
            query_states = self.q_proj(hidden_states).view(
                bsz, q_len, self.num_heads, self.head_dim
            )  # .transpose(1, 2)
            query_states = query_states.transpose(1, 2)
            (
                _,
                kv_len,
                _,
            ) = (
                key_value_states.size()
            )  # Note that, in this case, `kv_len` == `kv_seq_len`
            key_states = (
                self.k_proj(key_value_states)
                .view(bsz, kv_len, self.num_heads, self.head_dim)
                .transpose(1, 2)
            )
            value_states = (
                self.v_proj(key_value_states)
                .view(bsz, kv_len, self.num_heads, self.head_dim)
                .transpose(1, 2)
            )
        else:
            qkv = self.qkv(hidden_states)
            query_states, key_states, value_states = qkv.split(
                self.num_heads * self.head_dim, dim=2
            )

            query_states = query_states.view(
                bsz, q_len, self.num_heads, self.head_dim
            )  # .transpose(1, 2)
            key_states = key_states.view(
                bsz, q_len, self.num_heads, self.head_dim
            )  # . transpose(1, 2)
            value_states = value_states.view(
                bsz, q_len, self.num_heads, self.head_dim
            )  # .transpose(1, 2)
            kv_seq_len = q_len
            if past_key_value is not None:
                kv_seq_len += past_key_value[0].shape[-2]
            max_s = max(kv_seq_len, q_len)
            cos, sin = self.rotary_emb.get_cos_sin(
                position_ids.view(-1), max_s, hidden_states.dtype
            )

            query_shape = query_states.shape
            key_shape = key_states.shape
            self.rotary_emb(
                query_states.view(-1, *query_shape[2:]),
                key_states.reshape(-1, *key_shape[2:]),
                cos,
                sin,
            )

            query_states = query_states.view(query_shape)
            key_states = key_states.view(key_shape)

            query_states = query_states.transpose(1, 2)
            key_states = key_states.transpose(1, 2)
            value_states = value_states.transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]
        # [bsz, nh, t, hd]

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        past_key_value = (key_states, value_states) if use_cache else None

        if self.qk_layer_norms:
            query_states = self.q_layer_norm(query_states)
            key_states = self.k_layer_norm(key_states)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )

        attn_output = nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            dropout_p=self.dropout,
        )

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        attn_weights = None
        if output_attentions:
            logger.warning_once(
                "attn_weights are not extracted in scaled_dot_product_attention. The model returns None instead"
            )

        return attn_output, attn_weights, past_key_value


# this was adapted from LlamaDecoderLayer
class IdeficsDecoderLayer(nn.Module):
    def __init__(self, layer_id: int, config: IdeficsConfig, weights):
        super().__init__()
        self.process_group = weights.process_group
        self.hidden_size = config.hidden_size
        prefix = f"model.layers.{layer_id}"
        self.self_attn = IdeficsAttention(
            config=config,
            prefix=f"{prefix}.self_attn",
            weights=weights,
            qk_layer_norms=False,
            is_cross_attention=False,
        )
        self.mlp = IdeficsMLP(
            config=config,
            prefix=f"{prefix}.mlp",
            weights=weights,
        )
        self.input_layernorm = IdeficsRMSNorm(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = IdeficsRMSNorm(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.dropout = config.dropout

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        # hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        # hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class IdeficsGatedCrossAttentionLayer(nn.Module):
    def __init__(self, layer_id, config: IdeficsConfig, weights):
        super().__init__()
        self.process_group = weights.process_group
        self.hidden_size = config.hidden_size
        prefix = f"model.gated_cross_attn_layers.{layer_id}"
        self.cross_attn = IdeficsAttention(
            config=config,
            prefix=f"{prefix}.cross_attn",
            weights=weights,
            qk_layer_norms=True,
            is_cross_attention=True,
        )
        self.mlp = IdeficsMLP(
            config=config,
            prefix=f"{prefix}.mlp",
            weights=weights,
        )
        self.input_layernorm = IdeficsRMSNorm(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = IdeficsRMSNorm(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.config = config.dropout

        self.act_cross_attn = nn.Tanh()
        self.act_dense = nn.Tanh()

        self.alpha_cross_attn = nn.Parameter(
            weights.get_tensor(f"{prefix}.alpha_cross_attn")
        )
        self.alpha_dense = nn.Parameter(weights.get_tensor(f"{prefix}.alpha_dense"))

        if not (hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")):
            raise ValueError("Alpha parameters not initialized correctly!")

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        image_hidden_states: Optional[torch.Tensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        no_images: Optional[bool] = False,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            no_images (`bool`, *optional*, defaults to `False`): If `True` the vision part is ignored
        """
        if image_hidden_states is None:
            raise ValueError(
                "`image_hidden_states` is required for Idefics cross attention module which are visual features to be"
                " conditioned on."
            )

        if past_key_value is not None:
            raise NotImplementedError(
                "Past key value states are not implemented for Idefics cross attention module."
            )

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.cross_attn(
            hidden_states=hidden_states,
            key_value_states=image_hidden_states,
            attention_mask=image_attention_mask,
            output_attentions=output_attentions,
        )
        # hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
        # when there are no images the model is used in pure language mode
        gate = 0 if no_images else 1
        hidden_states = (
            residual + gate * self.act_cross_attn(self.alpha_cross_attn) * hidden_states
        )

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        # hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
        hidden_states = residual + self.act_dense(self.alpha_dense) * hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`IdeficsConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


# @add_start_docstrings(
#     "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
#     LLAMA_START_DOCSTRING,
# )
class IdeficsPreTrainedModel(PreTrainedModel):
    config_class = IdeficsConfig
    # base_model_prefix = "model"
    # supports_gradient_checkpointing = True
    # _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"]

    # def _init_weights(self, module):
    #     # important: this ported version of Idefics isn't meant for training from scratch - only
    #     # inference and fine-tuning - so the proper init weights code has been removed - the m4 code
    #     # base should be used for training from scratch and it contains the correct code.
    #     std = self.config.initializer_range
    #     if isinstance(module, nn.Linear):
    #         module.weight.data.normal_(mean=0.0, std=std)
    #         if module.bias is not None:
    #             module.bias.data.zero_()
    #     elif isinstance(module, nn.Embedding):
    #         module.weight.data.normal_(mean=0.0, std=std)
    #         if module.padding_idx is not None:
    #             module.weight.data[module.padding_idx].zero_()

    # def _set_gradient_checkpointing(self, module, value=False):
    #     if isinstance(module, IdeficsModel):
    #         module.gradient_checkpointing = value


# LLAMA_INPUTS_DOCSTRING = r"""
#     Args:
#         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
#             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
#             it.

#             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
#             [`PreTrainedTokenizer.__call__`] for details.

#             [What are input IDs?](../glossary#input-ids)
#         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
#             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

#             - 1 for tokens that are **not masked**,
#             - 0 for tokens that are **masked**.

#             [What are attention masks?](../glossary#attention-mask)

#             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
#             [`PreTrainedTokenizer.__call__`] for details.

#             If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
#             `past_key_values`).

#             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
#             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
#             information on the default strategy.

#             - 1 indicates the head is **not masked**,
#             - 0 indicates the head is **masked**.
#         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
#             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
#             config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
#         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
#             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
#             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
#             `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

#             Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
#             blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

#             If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
#             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
#             `decoder_input_ids` of shape `(batch_size, sequence_length)`.
#         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
#             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
#             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
#             model's internal embedding lookup matrix.
#         use_cache (`bool`, *optional*):
#             If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
#             `past_key_values`).
#         output_attentions (`bool`, *optional*):
#             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
#             tensors for more detail.
#         output_hidden_states (`bool`, *optional*):
#             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
#             more detail.
#         return_dict (`bool`, *optional*):
#             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
# """


# @add_start_docstrings(
#     "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
#     LLAMA_START_DOCSTRING,
# )
class IdeficsModel(IdeficsPreTrainedModel):
    # """
    # Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

    # Args:
    #     config: IdeficsConfig
    # """

    def __init__(self, config: IdeficsConfig, weights):
        super().__init__(config)
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = IdeficsDecoupledPartialTPEmbedding(
            config=config,
            weights=weights,
        )

        self.image_size = config.vision_config.image_size
        self.vision_config = config.vision_config
        self.vision_model = IdeficsVisionTransformer(
            prefix="model.vision_model",
            config=config.vision_config,
            weights=weights,
        )

        # Perceiver Resampler
        if config.use_resampler:
            perceiver_config = config.perceiver_config
            self.perceiver_resampler = IdeficsPerceiverResampler(
                prefix="model.perceiver_resampler",
                config=config,
                embed_dim=config.vision_config.embed_dim,
                depth=perceiver_config.resampler_depth,
                n_heads=perceiver_config.resampler_n_heads,
                head_dim=perceiver_config.resampler_head_dim,
                n_latents=perceiver_config.resampler_n_latents,
                weights=weights,
            )

        self.layers = nn.ModuleList(
            [
                IdeficsDecoderLayer(layer_id, config, weights)
                for layer_id in range(config.num_hidden_layers)
            ]
        )

        self.cross_layer_interval = config.cross_layer_interval
        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
        self.gated_cross_attn_layers = nn.ModuleList(
            [
                IdeficsGatedCrossAttentionLayer(layer_id, config, weights)
                for layer_id in range(num_cross_layers)
            ]
        )
        # self.gradient_checkpointing = False

        self.norm = IdeficsRMSNorm(
            prefix="model.norm", weights=weights, eps=config.rms_norm_eps
        )

        # self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        # self.post_init()

        # self.freeze_relevant_params(config)

    # def freeze_relevant_params(self, config=None):
    #     if config is None:
    #         config = self.config

    #     if config.freeze_text_layers:
    #         self.freeze_text_layers(config.freeze_text_module_exceptions)

    #     if config.freeze_vision_layers:
    #         freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)

    # def freeze_text_layers(self, module_exceptions=[]):
    #     for module in [self.layers, self.norm]:
    #         freeze_model(module, module_exceptions=module_exceptions)

    # def freeze_vision_layers(self, module_exceptions=[]):
    #     freeze_model(self.vision_model, module_exceptions=module_exceptions)

    # def get_input_embeddings(self):
    #     return self.embed_tokens

    # def set_input_embeddings(self, value):
    #     self.embed_tokens = value

    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
    def _prepare_decoder_attention_mask(
        self, attention_mask, input_shape, inputs_embeds, past_key_values_length
    ):
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                device=inputs_embeds.device,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(
                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            ).to(inputs_embeds.device)
            combined_attention_mask = (
                expanded_attn_mask
                if combined_attention_mask is None
                else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    # @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_hidden_states: Optional[torch.FloatTensor] = None,
        image_embeddings: Optional[torch.FloatTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastImage]:
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError(
                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
            )

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
        elif position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=device,
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        no_images = False

        if image_hidden_states is None:
            if pixel_values is None and image_embeddings is None:
                raise ValueError(
                    "Either pixel_values and image_embeddings have to be not-None."
                )

            elif pixel_values is not None and image_embeddings is not None:
                raise ValueError(
                    "You cannot specify both pixel_values and image_embeddings at the same time"
                )

            elif pixel_values is not None:
                no_images = len(torch.nonzero(pixel_values)) == 0
                pixel_values = pixel_values.to(
                    dtype=self.dtype, device=device
                )  # fp16 compatibility
                batch_size, num_images = pixel_values.shape[:2]
                pixel_values = pixel_values.contiguous().view(
                    batch_size * num_images, *pixel_values.shape[2:]
                )

                # Get sequence from the vision encoder
                image_hidden_states = self.vision_model(
                    pixel_values=pixel_values
                ).last_hidden_state

            elif image_embeddings is not None:
                (
                    batch_size,
                    num_images,
                    image_seq_len,
                    image_hidden_size,
                ) = image_embeddings.size()
                image_hidden_states = image_embeddings.to(
                    dtype=self.dtype, device=input_ids.device
                )
                image_hidden_states = image_hidden_states.view(
                    batch_size * num_images, image_seq_len, image_hidden_size
                )

            if self.config.use_resampler:
                image_hidden_states = self.perceiver_resampler(image_hidden_states)
            image_seq_len, image_hidden_size = image_hidden_states.size(
                1
            ), image_hidden_states.size(2)
            image_hidden_states = image_hidden_states.view(
                batch_size, num_images * image_seq_len, image_hidden_size
            )
        else:
            no_images = False
            num_images = pixel_values.shape[1]
            image_seq_len = image_hidden_states.shape[1] // num_images

        # # Hack to use the model in full language modeling mode
        # image_attention_mask = torch.zeros(batch_size, seq_length, 1, dtype=torch.long, device=image_hidden_states.device)
        # Make image_attention_mask compatible with hidden states
        text_seq_len = image_attention_mask.size(1)
        image_attention_mask = image_attention_mask.unsqueeze(-1)
        image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
        image_attention_mask = image_attention_mask.view(
            batch_size, text_seq_len, num_images * image_seq_len
        )
        image_batch_size, image_sequence_length, _ = image_hidden_states.size()
        image_hidden_shape = (image_batch_size, image_sequence_length)
        if image_attention_mask is None:
            image_attention_mask = torch.ones(image_hidden_shape, device=device)
        image_attention_mask = self.invert_attention_mask(image_attention_mask)

        # if list(image_attention_mask.shape) != [4, 1, 1024, 64]:
        #     raise ValueError(f"Image hidden_states {image_hidden_states.shape} - mask {image_attention_mask.shape} {num_images} {image_seq_len} {text_seq_len}")

        # if image_hidden_states is not None:
        # else:
        #     image_attention_mask = None

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        # embed positions
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past),
                dtype=torch.bool,
                device=inputs_embeds.device,
            )
        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask,
            (batch_size, seq_length),
            inputs_embeds,
            past_key_values_length,
        )

        hidden_states = inputs_embeds

        # if self.gradient_checkpointing and self.training:
        #     if use_cache:
        #         logger.warning_once(
        #             "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
        #         )
        #         use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = (
                past_key_values[idx] if past_key_values is not None else None
            )

            def vblock(
                main_block,
                hidden_states,
                attention_mask,
                position_ids,
                past_key_value,
                image_hidden_states,
                image_attention_mask,
                output_attentions,
                use_cache,
                no_images,
                layer_idx,
                cross_layer_interval,
                gated_cross_attn_layers,
            ):
                # TODO(ls): Add cross attention values to respective lists
                if layer_idx % cross_layer_interval == 0:
                    xblock = gated_cross_attn_layers[layer_idx // cross_layer_interval]
                    outputs = xblock(
                        hidden_states,
                        attention_mask=attention_mask,
                        image_hidden_states=image_hidden_states,
                        image_attention_mask=image_attention_mask,
                        output_attentions=output_attentions,
                        use_cache=use_cache,
                        past_key_value=None,  # not implemented
                        no_images=no_images,
                    )
                    hidden_states = outputs[0]

                layer_outputs = main_block(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

                return layer_outputs

            # if self.gradient_checkpointing and self.training:
            #     past_key_value = None
            #     if use_cache:
            #         logger.warning_once(
            #             "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            #         )
            #         use_cache = False

            #     layer_outputs = torch.utils.checkpoint.checkpoint(
            #         vblock,
            #         decoder_layer,
            #         hidden_states,
            #         attention_mask,
            #         position_ids,
            #         past_key_value,
            #         image_hidden_states,
            #         image_attention_mask,
            #         output_attentions,
            #         use_cache,
            #         no_images,
            #         idx,
            #         self.cross_layer_interval,
            #         self.gated_cross_attn_layers,
            #     )
            # else:
            layer_outputs = vblock(
                decoder_layer,
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                image_hidden_states=image_hidden_states,
                image_attention_mask=image_attention_mask,
                output_attentions=output_attentions,
                use_cache=use_cache,
                no_images=no_images,
                layer_idx=idx,
                cross_layer_interval=self.cross_layer_interval,
                gated_cross_attn_layers=self.gated_cross_attn_layers,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPastImage(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            image_hidden_states=image_hidden_states,
        )


class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
    def __init__(
        self,
        config,
        weights,
    ):
        super().__init__(config)
        self.model = IdeficsModel(
            config=config,
            weights=weights,
        )

        self.lm_head = IdeficsDecoupledTensorParallelLinear(
            config=config,
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_embeddings: Optional[torch.FloatTensor] = None,
        image_hidden_states: Optional[torch.FloatTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPastImage]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you consciours? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
        ```"""

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            pixel_values=pixel_values,
            image_embeddings=image_embeddings,
            image_hidden_states=image_hidden_states,
            image_attention_mask=image_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits, speculative_logits = self.lm_head(hidden_states)

        loss = None

        return (
            CausalLMOutputWithPastImage(
                loss=loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
                image_hidden_states=outputs.image_hidden_states,
            ),
            speculative_logits,
        )

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        inputs = prepare_inputs_for_generation(input_ids, past=past, **kwargs)
        unwanted_kwargs = ["token_type_ids"]
        for kwarg in unwanted_kwargs:
            inputs.pop(kwarg, None)
        return inputs

    @staticmethod
    def _expand_inputs_for_generation(
        *args,
        **model_kwargs,
    ):
        return expand_inputs_for_generation(*args, **model_kwargs)

    @staticmethod
    def _update_model_kwargs_for_generation(
        outputs, model_kwargs, is_encoder_decoder=False
    ):
        return update_model_kwargs_for_generation(
            outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder
        )

    @staticmethod
    def _reorder_cache(past, beam_idx):
        reordered_past = ()
        for layer_past in past:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx) for past_state in layer_past
                ),
            )
        return reordered_past


================================================
FILE: server/text_generation_server/models/custom_modeling/idefics_perceiver.py
================================================
# This code was adapted from https://github.com/lucidrains/flamingo-pytorch licensed under the MIT License.
#
# MIT License
#
# Copyright (c) 2020  The Google AI Language Team Authors, The HuggingFace Inc. team and github/lonePatient
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


"""

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

"""
from typing import Optional, Tuple

import torch
import torch.nn as nn

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)

EPS = 1e-5


class IdeficsPerceiverResampler(nn.Module):
    def __init__(
        self,
        prefix,
        config,
        embed_dim: int,
        depth: int,
        n_heads: int,
        head_dim: int,
        n_latents: int,
        weights,
    ) -> None:
        """
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. :param embed_dim: Dimensionality of embeddings being fed
        to the Perceiver Resampler (also dimensionality of latent embeddings *returned* by the Perceiver Resampler.
        Could be e.g., VIT embed_dim, ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        """
        super().__init__()
        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = (
            embed_dim,
            n_heads,
            head_dim,
            n_latents,
        )
        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

        # Create Latents for Perceiver
        self.latents = nn.Parameter(weights.get_tensor(f"{prefix}.latents"))

        self.intermediate_dim = (
            self.embed_dim * 4
            if not hasattr(config.vision_config, "embed_dim")
            else config.vision_config.embed_dim * 4
        )
        # Create Transformer Blocks
        self.blocks = nn.ModuleList(
            [
                nn.ModuleList(
                    [
                        IdeficsPerceiverAttention(
                            prefix=f"{prefix}.blocks.{layer_id}.0",
                            config=config,
                            embed_dim=self.embed_dim,
                            n_heads=self.n_heads,
                            head_dim=self.head_dim,
                            qk_layer_norms=self.qk_layer_norms,
                            weights=weights,
                        ),
                        IdeficsMLP(
                            prefix=f"{prefix}.blocks.{layer_id}.1",
                            intermediate_size=self.intermediate_dim,
                            config=config,
                            weights=weights,
                        ),
                    ]
                )
                for layer_id in range(depth)
            ]
        )
        self.layer_norm = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm", weights=weights, eps=EPS
        )

    def forward(self, context: torch.Tensor) -> torch.Tensor:
        """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
        # einsum.repeat(self.latents, "seq embed -> bsz seq embed", bsz=context.shape[0])
        latents = self.latents.repeat(context.shape[0], 1, 1)

        # Feed through Perceiver Attention blocks...
        for attn, ff in self.blocks:
            latents = attn(context, latents) + latents
            latents = ff(latents) + latents

        return self.layer_norm(latents)


class IdeficsPerceiverAttention(nn.Module):
    def __init__(
        self,
        prefix,
        config,
        embed_dim: int,
        n_heads: int,
        head_dim: int,
        qk_layer_norms: bool,
        weights,
    ) -> None:
        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
        super().__init__()
        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
        self.qk_layer_norms = qk_layer_norms
        # Normalization & Scaling
        self.context_layer_norm = nn.LayerNorm.load(
            prefix=f"{prefix}.context_layer_norm", weights=weights, eps=EPS
        )
        self.latents_layer_norm = nn.LayerNorm.load(
            prefix=f"{prefix}.latents_layer_norm", weights=weights, eps=EPS
        )
        if self.qk_layer_norms:
            self.q_layer_norm = nn.LayerNorm.load(
                prefix=f"{prefix}.q_layer_norm", weights=weights, eps=EPS
            )
            self.k_layer_norm = nn.LayerNorm.load(
                prefix=f"{prefix}.k_layer_norm", weights=weights, eps=EPS
            )

        self.qk_scale = self.head_dim**-0.5

        if n_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {n_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.n_heads //= weights.process_group.size()

        # Q, K, V Projection (no bias -- detail from Perceiver/Flamingo Papers).
        self.q_proj = TensorParallelColumnLinear.load(
            config=config, prefix=f"{prefix}.q_proj", weights=weights, bias=False
        )
        self.k_proj = TensorParallelColumnLinear.load(
            config=config, prefix=f"{prefix}.k_proj", weights=weights, bias=False
        )
        self.v_proj = TensorParallelColumnLinear.load(
            config=config, prefix=f"{prefix}.v_proj", weights=weights, bias=False
        )

        self.output_proj = TensorParallelRowLinear.load(
            config=config, prefix=f"{prefix}.output_proj", weights=weights, bias=False
        )

    def forward(self, context: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
        """
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`torch.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`torch.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        """
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = context.shape[:3]

        # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
        #   Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
        q = self.q_proj(latents)
        k = self.k_proj(torch.cat([context, latents], dim=-2))
        v = self.v_proj(torch.cat([context, latents], dim=-2))

        # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
        #   =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
        # einsum.rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads)
        q, k, v = [
            x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(
                1, 2
            )
            for x in (q, k, v)
        ]

        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        scores = torch.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        stabilized_scores = scores - (scores.amax(dim=-1, keepdim=True).detach())
        attn = stabilized_scores.softmax(dim=-1)

        # Attend & project back to output...
        resampled = torch.einsum("... i j, ... j d -> ... i d", attn, v)
        # einsum.rearrange(resampled, "bsz heads seq embed -> bsz seq (heads embed)", heads=self.n_heads)
        return self.output_proj(resampled.transpose(1, 2).flatten(-2))


class IdeficsMLP(nn.Module):
    def __init__(
        self,
        prefix,
        intermediate_size,
        config,
        weights,
    ):
        """Simple MLP block with intermediate_size and embedding size"""
        super().__init__()
        self.embed_dim = config.vision_config.embed_dim
        self.ln = nn.LayerNorm.load(prefix=f"{prefix}.ln", weights=weights, eps=EPS)
        self.fc = TensorParallelColumnLinear.load(
            config=config,
            prefix=f"{prefix}.fc",
            weights=weights,
            bias=False,
        )
        self.act = nn.ReLU()
        self.c_proj = TensorParallelRowLinear.load(
            config=config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=False,
        )

    def forward(
        self, hidden_states: Optional[Tuple[torch.FloatTensor]]
    ) -> torch.FloatTensor:
        hidden_states = self.ln(hidden_states)
        hidden_states = self.fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)

        return hidden_states


================================================
FILE: server/text_generation_server/models/custom_modeling/idefics_processing.py
================================================
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for IDEFICS.
"""

from typing import Callable, List, Optional, Union
from urllib.parse import urlparse

from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import (
    BatchEncoding,
    PaddingStrategy,
    TextInput,
    TruncationStrategy,
)
from transformers.utils import TensorType, is_torch_available


if is_torch_available():
    import torch


IMAGE_TOKEN = "<image>"


# copied from m4.training.packing
def incremental_to_binary_attention_mask(incremental_mask, num_classes=-1):
    # This function converts: [-1, 0, 1] => [[0, 0], [1, 0], [0, 1]]

    # If any of images index are more than num_classes, set them to -1.
    # Words after the max number of images allowed have been seen don't attend on anything
    if num_classes != -1:
        incremental_mask[incremental_mask >= num_classes] = -1

    negatives = incremental_mask == -1
    incremental_mask[negatives] = 0
    attn_mask = torch.nn.functional.one_hot(incremental_mask, num_classes=num_classes)
    attn_mask[negatives, :] = 0
    return attn_mask


# copied from m4.training.packing
def image_attention_mask_for_packed_input_ids(input_ids, tokenizer):
    image_attention_mask = torch.full_like(input_ids, fill_value=-1)
    next_image_attention_mask = torch.full_like(input_ids, fill_value=-1)
    image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
    eod_token_id = tokenizer.eos_token_id
    for batch_idx in range(input_ids.size(0)):
        count = -1
        seen_eod = False
        for idx, token_id in enumerate(input_ids[batch_idx]):
            if token_id == image_token_id:
                count += 1
                image_attention_mask[batch_idx][idx] = count
                seen_eod = False
            else:
                image_attention_mask[batch_idx][idx] = count

            if seen_eod:
                image_attention_mask[batch_idx][idx] = -1

            if token_id == eod_token_id:
                seen_eod = True

    for batch_idx in range(input_ids.size(0)):
        count = -1
        seen_eod = False
        for idx in range(input_ids[batch_idx].size(0) - 1, -1, -1):
            token_id = input_ids[batch_idx][idx]
            if token_id == image_token_id:
                count += 1
                next_image_attention_mask[batch_idx][idx] = count
                seen_eod = False
            else:
                next_image_attention_mask[batch_idx][idx] = count

            if token_id == eod_token_id:
                seen_eod = True

            if seen_eod:
                next_image_attention_mask[batch_idx][idx] = -1

        non_negative_indices = next_image_attention_mask[batch_idx] != -1
        next_image_attention_mask[batch_idx][non_negative_indices] -= count
        next_image_attention_mask[batch_idx][non_negative_indices] *= -1

    return image_attention_mask, next_image_attention_mask


def is_url(string):
    """Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
    invalidated the url"""
    if " " in string:
        return False
    result = urlparse(string)
    return all([result.scheme, result.netloc])


def is_image(string):
    """Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
    invalidated the url"""
    return is_url(string) or string.startswith("data:")


class IdeficsProcessor(ProcessorMixin):
    r"""
    Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor.

    [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

    Args:
        image_processor (`IdeficsImageProcessor`):
            An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
        tokenizer (`LlamaTokenizerFast`):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "IdeficsImageProcessor"
    tokenizer_class = "LlamaTokenizerFast"

    def __init__(
        self,
        image_processor,
        tokenizer=None,
        image_size=224,
        add_end_of_utterance_token=None,
        **kwargs,
    ):
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor
        self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)

        self.default_image_dims = (
            self.image_processor.image_num_channels,
            self.image_processor.image_size,
            self.image_processor.image_size,
        )

        self.tokenizer_was_trained_with_end_of_utterance_token = (
            True
            if "<end_of_utterance>"
            in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
            else False
        )

    def __call__(
        self,
        prompts: Union[List[TextInput], List[List[TextInput]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        transform: Callable = None,
        add_eos_token=False,
        add_end_of_utterance_token=None,
        debug=False,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchEncoding:
        """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
        the model was trained on and prepares the image pixel values for the model to process.

        Args:
            prompts (`Union[List[TextInput], [List[List[TextInput]]]]`):
                either a single prompt or a batched list of prompts - see the detailed description immediately after
                the end of the arguments doc section.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            transform (`Callable`, *optional*):
                A custom transform function that accepts a single image can be passed for training. For example,
                `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
                set of transforms will be applied to the images
            add_eos_token (`bool`, *optional*, defaults to `False`):
                Adds `eos_token` at the end of the final prompt if True`
            add_end_of_utterance_token (`bool`, *optional*)
                Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
                image). If `None` the tokenizer will be checked instead and if this token is found in
                `additional_special_tokens` then the value will be `True`.
            debug (`bool`, *optional*, defaults to `False`):
                `True` value will help debug prompt generation by dumping useful information
            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
                The type of tensors to return. Can be one of:
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.

        Returns:
            a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
            directly passed to `model.generate`

        Detailed explanation:

        Each entry in `prompts` is either a text to be passed as is or an image that will be processed.

        An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.

        When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
        entry into the prompt.

        Example:

        ```python
        checkpoint = "HuggingFaceM4/idefics-9b"
        processor = AutoProcessor.from_pretrained(checkpoint)
        url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
        img = processor.image_processor.fetch_images([url])[0]

        prompts = [
            "User:",
            img,
            "Describe this image.\nAssistant: An image of two kittens in grass.\n",
            "User:",
            "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
            "Describe this image.\nAssistant:",
        ]

        inputs = processor(prompts, return_tensors="pt")
        generated_ids = model.generate(**inputs, max_length=100)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```

        In this example the `prompts` will be converted into:

        ```
        <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant: An image of two kittens in grass.
        User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant:'
        ```

        and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
        `pixel_values` dict entry of the return value.

        This example also examplifies that images can be passed as objects or as text urls. It can be seen that the
        first image is passed as object and the second one as a url.

        To do training do:

        ```python
        image_transform = transforms.Compose(
            [
                transforms.RandomResizedCrop(
                    (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
                ),
                transforms.ToTensor(),
                transforms.Normalize(mean=self.image_mean, std=self.image_std),
            ]
        )
        inputs = processor(prompts, transform=image_transform, return_tensors="pt")
        ```

        In order to help debug prompt generation enable `debug=True` which will show you what's happening.

        """

        # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
        if add_end_of_utterance_token is None:
            add_end_of_utterance_token = (
                self.tokenizer_was_trained_with_end_of_utterance_token
            )

        # turn non-batched prompts into batched
        if not any(isinstance(i, list) for i in prompts):
            prompts = [prompts]

        fake_token = "<fake_token_around_image>"
        image_token = "<image>"
        end_of_utterance_token = "<end_of_utterance>"

        def image_tokens(last_was_image):
            if last_was_image:
                return image_token + fake_token
            else:
                return fake_token + image_token + fake_token

        all_texts = []
        all_images = []
        for sample in prompts:
            # the model was trained on samples starting with <s>
            full_text = f"{self.tokenizer.bos_token}"

            # an image can either be an image object in the item or the url, everything else is a verbatim prompt text
            image_objects = []
            last_was_image = False
            last_was_text = False
            for i, item in enumerate(sample):
                if i > 0:
                    last_was_text = True if not last_was_image else False

                if isinstance(item, str):
                    item = item.strip(" ")
                    if is_image(item):
                        image = self.image_processor.fetch_images(item)
                        full_text += image_tokens(last_was_image)
                        image_objects.append(image)
                        last_was_image = True
                    else:
                        # we add end_of_utterance_token between each subsequent text prompts (but not at the last one!)
                        if add_end_of_utterance_token and last_was_text:
                            full_text += end_of_utterance_token
                        full_text += item
                        last_was_image = False
                else:
                    # must be an image obj
                    full_text += image_tokens(last_was_image)
                    image_objects.append(item)
                    last_was_image = True

            if add_eos_token:
                full_text += self.tokenizer.eos_token

            if debug is True:
                print(f"{full_text=}")

            image_objects = self.image_processor(image_objects, transform=transform)

            text_encoding = self.tokenizer(
                text=full_text,
                add_special_tokens=False,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
            )

            all_texts.append(text_encoding["input_ids"])
            all_images.append(image_objects)

        max_seq_len = max(len(x) for x in all_texts)

        # max_num_images has to be at least 1 even when there are no images
        max_num_images = max(len(x) for x in all_images)
        max_num_images = max(1, max_num_images)

        at_least_one_image = sum(len(x) for x in all_images) > 0
        output_input_ids = []
        output_images = []
        output_attention_masks = []
        for text, images in zip(all_texts, all_images):
            padded_input_ids = [self.tokenizer.pad_token_id] * max_seq_len
            unpadded_seq_len = len(text)
            start = max_seq_len - unpadded_seq_len
            padded_input_ids[start:] = text[:max_seq_len]

            attention_mask = torch.zeros((max_seq_len,), dtype=torch.long)
            attention_mask[start:] = 1

            image_count = padded_input_ids.count(self.image_token_id)
            local_max_num_images = min(image_count, max_num_images)

            current_images = images[:local_max_num_images]

            if len(current_images) > 0:
                padded_image_tensor = torch.zeros(
                    max_num_images, *current_images.size()[1:]
                )
                padded_image_tensor[: current_images.size(0)] = current_images
            else:
                padded_image_tensor = torch.zeros(
                    max_num_images, *self.default_image_dims
                )

            output_images.append(padded_image_tensor)
            output_input_ids.append(torch.tensor(padded_input_ids))

            output_attention_masks.append(attention_mask)

        output_input_ids = torch.stack(output_input_ids)
        output_images = torch.stack(output_images)
        output_attention_masks = torch.stack(output_attention_masks)

        if at_least_one_image:
            image_attention_mask, _ = image_attention_mask_for_packed_input_ids(
                output_input_ids, self.tokenizer
            )
            image_attention_mask = incremental_to_binary_attention_mask(
                image_attention_mask, num_classes=max_num_images
            )
        else:
            # in full language mode we set the image mask to all-0s
            image_attention_mask = torch.zeros(
                output_input_ids.shape[0],
                output_input_ids.shape[1],
                1,
                dtype=torch.bool,
            )

        return BatchFeature(
            data={
                "input_ids": output_input_ids,
                "attention_mask": output_attention_masks,
                "pixel_values": output_images,
                "image_attention_mask": image_attention_mask,
            }
        )

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


================================================
FILE: server/text_generation_server/models/custom_modeling/idefics_vision.py
================================================
# coding=utf-8
# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""


from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from transformers.utils import (
    ModelOutput,
    logging,
)
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    TensorParallelEmbedding,
)

logger = logging.get_logger(__name__)


@dataclass
class IdeficsVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Idefics
class IdeficsVisionEmbeddings(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(
            weights.get_tensor(f"{prefix}.class_embedding")
        )

        self.patch_embedding = nn.Conv2d.load_no_bias(
            prefix=f"{prefix}.patch_embedding",
            weights=weights,
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = TensorParallelEmbedding(
            prefix="model.vision_model.embeddings.position_embedding", weights=weights
        )
        self.position_ids = (
            torch.arange(self.num_positions).expand((1, -1)).to(device=weights.device)
        )

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(
            pixel_values.to(dtype=target_dtype)
        )  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->IdeficsVision
class IdeficsVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.embed_dim = self.embed_dim // weights.process_group.size()

        self.k_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.k_proj", weights=weights, bias=True
        )
        self.v_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.v_proj", weights=weights, bias=True
        )
        self.q_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.q_proj", weights=weights, bias=True
        )
        self.out_proj = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = (
                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
                + causal_attention_mask
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = (
                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
                + attention_mask
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit akward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(
                bsz, self.num_heads, tgt_len, src_len
            )
            attn_weights = attn_weights_reshaped.view(
                bsz * self.num_heads, tgt_len, src_len
            )
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->IdeficsVision
class IdeficsVisionMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.fc1", weights=weights, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.fc2", weights=weights, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->IdeficsVision
class IdeficsVisionEncoderLayer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = IdeficsVisionAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.layer_norm1 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps
        )
        self.mlp = IdeficsVisionMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights
        )
        self.layer_norm2 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->IdeficsVision
class IdeficsVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    """

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                IdeficsVisionEncoderLayer(
                    prefix=f"{prefix}.encoder.layers.{layer_id}",
                    config=config,
                    weights=weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        # self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # if self.gradient_checkpointing and self.training:

            #     def create_custom_forward(module):
            #         def custom_forward(*inputs):
            #             return module(*inputs, output_attentions)

            #         return custom_forward

            #     layer_outputs = torch.utils.checkpoint.checkpoint(
            #         create_custom_forward(encoder_layer),
            #         hidden_states,
            #         attention_mask,
            #         causal_attention_mask,
            #     )
            # else:
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, encoder_states, all_attentions]
                if v is not None
            )
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )


# Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer
class IdeficsVisionTransformer(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config

        self.embeddings = IdeficsVisionEmbeddings(
            prefix=f"{prefix}.embeddings", config=config, weights=weights
        )
        self.pre_layrnorm = nn.LayerNorm.load(
            prefix=f"{prefix}.pre_layrnorm", weights=weights, eps=config.layer_norm_eps
        )
        self.encoder = IdeficsVisionEncoder(
            prefix=prefix, config=config, weights=weights
        )
        self.post_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.post_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )

    # copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


================================================
FILE: server/text_generation_server/models/custom_modeling/llava_next.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Llava-NeXT model."""

from typing import List, Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from transformers.image_processing_utils import select_best_resolution

from text_generation_server.layers.attention import Seqlen
from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
    load_vision_model,
)
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)


def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (`tuple`):
            The size of the input image in the format (height, width).
        grid_pinpoints (`List`):
            A list containing possible resolutions. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        patch_size (`int`):
            The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (height, width).
    """
    if not isinstance(grid_pinpoints, list):
        raise ValueError("grid_pinpoints should be a list of tuples or lists")

    height, width = select_best_resolution(image_size, grid_pinpoints)
    return height // patch_size, width // patch_size


def unpad_image(tensor, original_size):
    """
    Unpads a PyTorch tensor of a padded and resized image.

    Args:
        tensor (`torch.Tensor`):
            The image tensor, assumed to be of shape (num_channels, height, width).
        original_size (`tuple`):
            The original size of the image (height, width).

    Returns:
        `torch.Tensor`: The unpadded image tensor.
    """
    original_height, original_width = original_size
    current_height, current_width = tensor.shape[1:]

    original_aspect_ratio = original_width / original_height
    current_aspect_ratio = current_width / current_height

    if original_aspect_ratio > current_aspect_ratio:
        scale_factor = current_width / original_width
        new_height = int(original_height * scale_factor)
        padding = (current_height - new_height) // 2
        unpadded_tensor = tensor[:, padding : current_height - padding, :]
    else:
        scale_factor = current_height / original_height
        new_width = int(original_width * scale_factor)
        padding = (current_width - new_width) // 2
        unpadded_tensor = tensor[:, :, padding : current_width - padding]

    return unpadded_tensor


# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext
class LlavaNextMultiModalProjector(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.linear_1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.linear_1", config=config, weights=weights, bias=True
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.linear_2", config=config, weights=weights, bias=True
        )

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


class LlavaNextForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = config.quantize
        vision_config = config.vision_config
        # Instead of selecting in hidden_states[-2].
        # Instead compute only the n -2 + 1 layers and don't pool
        if config.vision_feature_layer < 0:
            vision_config.num_hidden_layers += config.vision_feature_layer + 1
        else:
            vision_config.num_hidden_layers = config.vision_feature_layer + 1
        self.vision_tower = load_vision_model(
            prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
            config=config.vision_config,
            weights=weights,
        )

        self.multi_modal_projector = LlavaNextMultiModalProjector(
            prefix="multi_modal_projector", config=config, weights=weights
        )

        self.image_newline = weights.get_tensor("image_newline")

        self.vocab_size = config.text_config.vocab_size
        self.config = config
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator
        self.text_model = load_text_model(
            prefix="language_model" if not prefix else f"{prefix}.language_model",
            config=config.text_config,
            weights=weights,
        )
        self.pad_token_id = (
            config.pad_token_id if config.pad_token_id is not None else -1
        )

    def _merge_input_ids_with_image_features(
        self,
        input_ids: torch.Tensor,
        inputs_embeds: torch.Tensor,
        image_features: torch.Tensor,
    ):
        """In place merges in vision_embeddings with inputs_embeds."""
        mask = input_ids == self.config.image_token_index
        # Let's pray we have enabled enough slots !
        try:
            inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
        except Exception as e:
            raise RuntimeError(
                f"Cannot fill images right now. If error happens at warmup, make sure you have enough `--max-input-tokens`  to handle images. If error happens at regular runtime, please fill in an issue: {e}"
            )
        return inputs_embeds

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        # num_special_image_tokens = (input_ids == self.config.image_token_index).sum()
        # assert num_special_image_tokens == len(pixel_values), f"Received {num_special_image_tokens} for {len(pixel_values)} images, this is invalid"
        # 1. Extract the input embeddings

        # 2. Merge text and images
        num_images, num_patches, channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(
            num_images * num_patches, channels, height, width
        )
        image_features = self.vision_tower(pixel_values)

        # selected_image_feature = image_features.hidden_states[self.config.vision_feature_layer]
        # Already done within the clip model
        selected_image_feature = image_features.last_hidden_state

        if self.config.vision_feature_select_strategy == "default":
            selected_image_feature = selected_image_feature[:, 1:]
        elif self.config.vision_feature_select_strategy == "full":
            selected_image_feature = selected_image_feature
        else:
            raise RuntimeError(
                f"Strategy `{self.config.vision_feature_select_strategy}` is not supported/valid."
            )

        image_features = self.multi_modal_projector(selected_image_feature)

        # split up image_features for each of the individual images
        # hence we get a list of image_features, each of shape (5, num_patches, hidden_size)
        # if we assume each image has 5 image features (base image + 4 patches)
        split_sizes = [num_patches] * num_images
        image_features = torch.split(image_features, split_sizes, dim=0)

        # NOTE we only support multimodal_patch_merge_type == "spatial_unpad"
        height = width = (
            self.config.vision_config.image_size // self.config.vision_config.patch_size
        )

        new_image_features = []
        for image_idx, image_feature in enumerate(image_features):
            if image_feature.shape[0] > 1:
                base_image_feature = image_feature[0]
                image_feature = image_feature[1:]

                if height * width != base_image_feature.shape[0]:
                    raise ValueError(
                        "The number of patches is not consistent with the image size."
                    )

                # Dimensions are intentionally swapped to be bug-compatible with
                # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
                    image_sizes[image_idx],
                    self.config.image_grid_pinpoints,
                    self.config.vision_config.image_size,
                )
                image_feature = image_feature.view(
                    num_patch_height, num_patch_width, height, width, -1
                )
                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                image_feature = unpad_image(image_feature, image_sizes[image_idx])
                image_feature = torch.cat(
                    (
                        image_feature,
                        self.image_newline[:, None, None].expand(
                            *image_feature.shape[:-1], 1
                        ),
                    ),
                    dim=-1,
                )
                image_feature = image_feature.flatten(1, 2).transpose(0, 1)
                image_feature = torch.cat((base_image_feature, image_feature), dim=0)
            else:
                image_feature = image_feature[0]
                image_feature = torch.cat(
                    (image_feature, self.image_newline[None]), dim=0
                )
            new_image_features.append(image_feature)
        image_features = torch.stack(new_image_features, dim=0)
        return image_features.view(-1, image_features.shape[-1])

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
        pixel_values: torch.FloatTensor = None,
        image_sizes: Optional[torch.LongTensor] = None,
    ):
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        if vision_embeds is not None:
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            inputs_embeds = self._merge_input_ids_with_image_features(
                input_ids, inputs_embeds, vision_embeds
            )
        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor] = None,
        # Unused for this model
        attention_mask: Optional[torch.BoolTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ):
        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
            true_max_s=max_s,
            prefill_cache_indices=None,
            adapter_data=adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.text_model.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/mamba_modeling.py
================================================
import torch
import torch.distributed

from mamba_ssm.ops.triton.selective_state_update import selective_state_update
from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
from torch import nn
from typing import Optional, Tuple, Any
from transformers.configuration_utils import PretrainedConfig
import torch.nn.functional as F

from text_generation_server.layers import (
    SpeculativeHead,
    TensorParallelEmbedding,
    FastLinear,
)
from text_generation_server.layers.layernorm import FastRMSNorm

from einops import rearrange
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
import math
from dataclasses import dataclass


@dataclass
class InferenceParams:
    """Inference parameters that are passed to the main model in order
    to efficienly calculate and store the context during inference."""

    max_seqlen: int
    max_batch_size: int
    conv_states: torch.Tensor
    ssm_states: torch.Tensor
    seqlen_offset: int


class MambaConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=50280,
        d_model=768,
        d_state=16,
        n_layer=32,
        layer_norm_epsilon=1e-5,
        tie_word_embeddings=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        expand=2,
        dt_rank="auto",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.layer_norm_epsilon = layer_norm_epsilon
        self.d_model = d_model
        self.d_inner = d_model * 2
        self.d_conv = 4
        self.d_state = d_state
        self.expand = expand
        self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class MambaBlock(nn.Module):
    def __init__(self, prefix, config, weights, layer_id):
        super().__init__()
        self.layer_id = layer_id
        self.in_proj = FastLinear.load(config, f"{prefix}.in_proj", weights, bias=False)
        self.x_proj = FastLinear.load(config, f"{prefix}.x_proj", weights, bias=False)
        self.dt_proj = FastLinear.load(config, f"{prefix}.dt_proj", weights, bias=True)
        self.dt_proj_no_bias = FastLinear.load(
            config, f"{prefix}.dt_proj", weights, bias=False
        )
        self.out_proj = FastLinear.load(
            config, f"{prefix}.out_proj", weights, bias=False
        )
        self.conv1d = FastLinear.load(config, f"{prefix}.conv1d", weights, bias=True)
        self.negA = -torch.exp(weights.get_tensor(f"{prefix}.A_log").float())
        self.D = weights.get_tensor(f"{prefix}.D")
        self.activation = "silu"
        self.dt_rank = config.dt_rank
        self.d_state = config.d_state
        self.d_conv = config.d_conv
        self.act = nn.SiLU()

    # inference_params
    def forward(self, hidden_states: torch.Tensor, inference_params=None):
        if inference_params.seqlen_offset > 0:
            conv_state = inference_params.conv_states[self.layer_id]
            ssm_state = inference_params.ssm_states[self.layer_id]
            out, conv_state, ssm_state = self.step(hidden_states, conv_state, ssm_state)
            return out, conv_state, ssm_state

        _, seqlen, _ = hidden_states.shape
        projected_states = self.in_proj(hidden_states).transpose(1, 2)
        # assert projected_states.shape == [batch_size, 2 * dstate, seqlen], f"{projected_states.shape} [{batch_size}, {dstate}, {seqlen}]"
        x, z = projected_states.chunk(2, dim=1)
        conv_state = F.pad(x, (self.d_conv - seqlen, 0))
        x = causal_conv1d_fn(
            x=x,
            weight=self.conv1d.weight.squeeze(1),
            bias=self.conv1d.bias,
            activation=self.activation,
        )

        # We're careful here about the layout, to avoid extra transposes.
        # We want dt to have d as the slowest moving dimension
        # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
        x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d"))  # (bl d)
        dt, B, C = torch.split(
            x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1
        )
        dt = self.dt_proj.weight @ dt.t()
        dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
        B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
        C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
        y, last_state = selective_scan_fn(
            x,
            dt,
            self.negA,
            B,
            C,
            self.D.float(),
            z=z,
            delta_bias=self.dt_proj.bias.float(),
            delta_softplus=True,
            return_last_state=True,
        )
        y = rearrange(y, "b d l -> b l d")
        attn_outputs = self.out_proj(y)
        return attn_outputs, conv_state, last_state

    def step(self, hidden_states, conv_state, ssm_state):
        xz = self.in_proj(hidden_states.squeeze(1))
        x, z = xz.chunk(2, dim=-1)  # (B D)
        x = causal_conv1d_update(
            x,
            conv_state,
            self.conv1d.weight.squeeze(1),
            self.conv1d.bias,
            self.activation,
        )
        x_db = self.x_proj(x)  # (B dt_rank+2*d_state)
        dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
        dt = F.linear(dt, self.dt_proj.weight)
        A = self.negA
        y = selective_state_update(
            ssm_state,
            x,
            dt,
            A,
            B,
            C,
            self.D,
            z=z,
            dt_bias=self.dt_proj.bias,
            dt_softplus=True,
        )
        out = self.out_proj(y)
        return out.unsqueeze(1), conv_state.clone(), ssm_state.clone()


class ResidualBlock(nn.Module):
    def __init__(self, prefix, config, weights, layer_id):
        super().__init__()
        self.mamba_block = MambaBlock(
            prefix=f"{prefix}.mixer", config=config, weights=weights, layer_id=layer_id
        )
        self.layer_norm = FastRMSNorm.load(
            prefix=f"{prefix}.norm", weights=weights, eps=config.layer_norm_epsilon
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
        inference_params: Optional[Any] = None,
    ):
        residual = (hidden_states + residual) if residual is not None else hidden_states
        shape = residual.shape
        hidden_states, _ = self.layer_norm(residual.view(-1, shape[-1]))
        hidden_states, conv_state, last_ssm_state = self.mamba_block(
            hidden_states.view(*shape), inference_params
        )
        return hidden_states, residual, conv_state, last_ssm_state


class MambaModel(nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        prefix = "backbone"
        try:
            self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embeddings", weights)
        except RuntimeError:
            self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embedding", weights)
        self.blocks = nn.ModuleList(
            [
                ResidualBlock(f"{prefix}.layers.{i}", config, weights, layer_id=i)
                for i in range(config.n_layer)
            ]
        )
        self.norm_f = FastRMSNorm.load(
            f"{prefix}.norm_f", weights, eps=config.layer_norm_epsilon
        )
        try:
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
        except RuntimeError:
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights)
        self.config = config

    def forward(
        self, input_ids: torch.Tensor, inference_params=None, residual=None
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.embed_tokens(input_ids)
        for i, block in enumerate(self.blocks):
            hidden_states, residual, conv_state, ssm_state = block(
                hidden_states, residual, inference_params
            )
            inference_params.conv_states[i].copy_(conv_state)
            inference_params.ssm_states[i].copy_(ssm_state)

        hidden_states = (
            hidden_states + residual if residual is not None else hidden_states
        )
        hidden_states, _ = self.norm_f(hidden_states.view(-1, hidden_states.size(-1)))
        hidden_states = hidden_states.view(residual.shape)
        logits, speculative_logits = self.lm_head(hidden_states)

        # update the offset for the next inference using these params
        inference_params.seqlen_offset += input_ids.size(1)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/mllama.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Mllama model."""

from typing import Optional, Tuple, List

import torch
import torch.utils.checkpoint
from torch import nn
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM == "ipex":
    import intel_extension_for_pytorch as ipex
else:
    import flash_attn_2_cuda

from transformers.activations import ACT2FN
import torch.nn.functional as F

from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    FastLinear,
)
from text_generation_server.layers.attention import (
    Seqlen,
)
from text_generation_server.models.custom_modeling.flash_llama_modeling import (
    FlashLlamaForCausalLM,
)


def _prepare_aspect_ratio_attention_mask(
    aspect_ratio_mask: torch.Tensor,
    num_patches: int,
    target_length: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    # Expand aspect ratio mask to target_length
    batch_size, max_num_tiles = aspect_ratio_mask.shape
    attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype)
    attention_mask = attention_mask.repeat(1, 1, target_length, 1)

    # Mask padding patches
    pad_patches = target_length - num_patches
    attention_mask[:, :, -pad_patches:] = 0

    # Invert the mask (0 -> 1, 1 -> 0)
    attention_mask = 1 - attention_mask

    # Reshape to 2D and create 4D attention mask
    # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length)
    attention_mask = attention_mask.reshape(
        batch_size, max_num_tiles * target_length, 1
    )
    attention_mask = (
        attention_mask @ attention_mask.transpose(-1, -2) * torch.finfo(dtype).min
    )
    attention_mask = attention_mask.unsqueeze(1)

    return attention_mask


# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
def _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask: torch.Tensor,
    sequence_length: int,
    target_length: int,
    dtype: torch.dtype,
    device: torch.device,
    min_dtype: float,
    cache_position: torch.Tensor,
    batch_size: int,
):
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

    Args:
        attention_mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
        sequence_length (`int`):
            The sequence length being processed.
        target_length (`int`):
            The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
        dtype (`torch.dtype`):
            The dtype to use for the 4D attention mask.
        device (`torch.device`):
            The device to plcae the 4D attention mask on.
        min_dtype (`float`):
            The minimum value representable with the dtype `dtype`.
        cache_position (`torch.Tensor`):
            Indices depicting the position of the input sequence tokens in the sequence.
        batch_size (`torch.Tensor`):
            Batch size.
    """
    if attention_mask is not None and attention_mask.dim() == 4:
        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
        causal_mask = attention_mask
    else:
        causal_mask = torch.full(
            (sequence_length, target_length),
            fill_value=min_dtype,
            dtype=dtype,
            device=device,
        )
        if sequence_length != 1:
            causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(
            target_length, device=device
        ) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
        if attention_mask is not None:
            causal_mask = (
                causal_mask.clone()
            )  # copy to contiguous memory for in-place edit
            mask_length = attention_mask.shape[-1]
            padding_mask = (
                causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
            )
            padding_mask = padding_mask == 0
            causal_mask[:, :, :, :mask_length] = causal_mask[
                :, :, :, :mask_length
            ].masked_fill(padding_mask, min_dtype)

    return causal_mask


def _prepare_cross_attention_mask(
    cross_attention_mask: torch.Tensor,
    num_vision_tokens: int,
    dtype: str,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # reshape so it can be used by attn module
    batch_size, text_total_length, *_ = cross_attention_mask.shape
    cross_attention_mask = cross_attention_mask.repeat_interleave(
        num_vision_tokens, dim=3
    )
    cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1)
    cross_attention_mask = cross_attention_mask.unsqueeze(1)

    # invert the mask
    inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype)
    cross_attention_mask = inverted_cross_attn_mask.masked_fill(
        inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min
    )

    # apply full-row bias, which return 4D tensor of shape [B, H, S1, 1] where value is 0 if the a full row in cross attn mask's
    # last dimension contains negative infinity values, otherwise it's 1
    negative_inf_value = torch.finfo(dtype).min
    full_text_row_masked_out_mask = (
        (cross_attention_mask != negative_inf_value)
        .any(dim=-1)
        .type_as(cross_attention_mask)[..., None]
    )
    cross_attention_mask *= full_text_row_masked_out_mask

    return cross_attention_mask, full_text_row_masked_out_mask


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->MllamaVision
class MllamaVisionMLP(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MllamaVisionSdpaAttention(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()

        self.embed_dim = config.hidden_size
        self.head_dim = config.hidden_size // config.attention_heads
        self.num_heads = config.attention_heads // weights.process_group.size()

        self.qkv_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )

    def forward(
        self,
        hidden_state: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        qkv = self.qkv_proj(hidden_state)
        query, key, value = qkv.split(
            [
                self.head_dim * self.num_heads,
                self.head_dim * self.num_heads,
                self.head_dim * self.num_heads,
            ],
            dim=2,
        )

        batch_size, q_seq_len, _ = query.shape
        _, kv_seq_len, _ = key.shape

        query = query.view(batch_size, q_seq_len, self.num_heads, self.head_dim)
        key = key.view(batch_size, kv_seq_len, self.num_heads, self.head_dim)
        value = value.view(batch_size, kv_seq_len, self.num_heads, self.head_dim)

        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        attn_output = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_seq_len, -1)

        output = self.o_proj(attn_output)
        return output


class MllamaVisionEncoderLayer(nn.Module):
    def __init__(self, *, prefix, config, weights, is_gated: bool):
        super().__init__()

        self.hidden_size = config.hidden_size
        self.num_attention_heads = config.attention_heads
        self.is_gated = is_gated
        self.intermediate_size = config.intermediate_size

        self.self_attn = MllamaVisionSdpaAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.mlp = MllamaVisionMLP(
            prefix=f"{prefix}.mlp", config=config, weights=weights
        )

        self.input_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=1e-05
        )
        self.post_attention_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=1e-05
        )

        # there used to be an if else here, no code path
        if is_gated:
            self.gate_attn = nn.Parameter(
                weights.get_tensor(f"{prefix}.gate_attn"), requires_grad=False
            )
            self.gate_ffn = nn.Parameter(
                weights.get_tensor(f"{prefix}.gate_ffn"), requires_grad=False
            )

    def forward(
        self,
        hidden_state: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        # Self Attention
        residual = hidden_state
        hidden_state = self.input_layernorm(hidden_state)
        hidden_state = self.self_attn(hidden_state, attention_mask=attention_mask)
        gate_attn = 1 if not self.is_gated else self.gate_attn.tanh()
        hidden_state = residual + gate_attn * hidden_state

        # Feed forward
        residual = hidden_state
        hidden_state = self.post_attention_layernorm(hidden_state)
        hidden_state = self.mlp(hidden_state)
        gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh()
        hidden_state = residual + gate_ffn * hidden_state
        return hidden_state


class MllamaVisionEncoder(nn.Module):
    def __init__(self, *, prefix, config, weights, is_gated: bool, num_layers: int):
        super().__init__()
        self.config = config
        self.layers = [
            MllamaVisionEncoderLayer(
                prefix=f"{prefix}.layers.{i}",
                config=config,
                weights=weights,
                is_gated=is_gated,
            )
            for i in range(num_layers)
        ]

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        encoder_states = [hidden_states]
        for encoder_layer in self.layers:
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
            )

            hidden_states = layer_outputs
            encoder_states.append(hidden_states)

        return hidden_states, encoder_states


class MllamaPrecomputedAspectRatioEmbedding(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.max_num_tiles = config.max_num_tiles
        self.hidden_size = config.hidden_size
        self.max_aspect_ratio_id = config.max_aspect_ratio_id

        self.embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.embedding", weights=weights
        )
        self.gate = nn.Parameter(
            weights.get_tensor(f"{prefix}.gate"), requires_grad=False
        )

    def forward(
        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
    ) -> torch.Tensor:
        embeddings = self.embedding(aspect_ratio_ids)
        embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size)

        # Always gated.
        embeddings = embeddings * self.gate.tanh()

        hidden_state = hidden_state + embeddings
        return hidden_state


class MllamaPrecomputedPositionEmbedding(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.max_num_tiles = config.max_num_tiles
        self.max_aspect_ratio_id = config.max_aspect_ratio_id
        self.num_patches = (config.image_size // config.patch_size) ** 2 + 1
        self.hidden_size = config.hidden_size
        self.scale = config.hidden_size**-0.5

        self.gate = nn.Parameter(
            weights.get_tensor(f"{prefix}.gate"), requires_grad=False
        )

        # position embedding
        embedding = nn.Parameter(
            weights.get_tensor(f"{prefix}.embedding"), requires_grad=False
        )
        self.gated_position_embedding = (1 - self.gate.tanh()) * embedding
        self.tile_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.tile_embedding", weights=weights
        )

    def forward(
        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
    ) -> torch.Tensor:
        # position embeddings
        hidden_state = hidden_state + self.gated_position_embedding.view(
            1, 1, self.num_patches, self.hidden_size
        )

        # precomputed tile position embeddings
        tile_position_embedding = self.tile_embedding(aspect_ratio_ids)
        batch_size = hidden_state.shape[0]
        tile_position_embedding = tile_position_embedding.reshape(
            batch_size, self.max_num_tiles, self.num_patches, self.hidden_size
        )
        gated_tile_position_embedding = self.gate.tanh() * tile_position_embedding
        hidden_state = hidden_state + gated_tile_position_embedding

        return hidden_state


class MllamaVisionModel(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.max_num_tiles = config.max_num_tiles
        self.hidden_size = config.hidden_size
        self.num_channels = config.num_channels
        self.intermediate_layers_indices = config.intermediate_layers_indices

        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
        self.scale = config.hidden_size**-0.5
        self.dtype = weights.dtype

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.hidden_size,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
            bias=False,
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )

        self.class_embedding = nn.Parameter(
            weights.get_tensor(f"{prefix}.class_embedding"), requires_grad=False
        )

        self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding(
            prefix=f"{prefix}.gated_positional_embedding",
            config=config,
            weights=weights,
        )

        self.pre_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding(
            prefix=f"{prefix}.pre_tile_positional_embedding",
            config=config,
            weights=weights,
        )
        self.post_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding(
            prefix=f"{prefix}.post_tile_positional_embedding",
            config=config,
            weights=weights,
        )

        ## layer norms
        self.layernorm_pre = nn.LayerNorm.load(
            prefix=f"{prefix}.layernorm_pre",
            weights=weights,
            # torch default
            eps=1e-05,
        )
        self.layernorm_post = nn.LayerNorm.load(
            prefix=f"{prefix}.layernorm_post",
            weights=weights,
            # torch default
            eps=1e-05,
        )

        ## encoders
        self.transformer = MllamaVisionEncoder(
            prefix=f"{prefix}.transformer",
            config=config,
            weights=weights,
            is_gated=False,
            num_layers=config.num_hidden_layers,
        )
        self.global_transformer = MllamaVisionEncoder(
            prefix=f"{prefix}.global_transformer",
            config=config,
            weights=weights,
            is_gated=True,
            num_layers=config.num_global_layers,
        )

    def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor:
        batch_size, _, hidden_size = hidden_state.shape
        class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size)
        hidden_state = torch.cat([class_embedding, hidden_state], dim=1)
        return hidden_state

    def forward(
        self,
        pixel_values: torch.Tensor,
        aspect_ratio_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        (
            batch_size,
            num_concurrent_media,
            num_tiles,
            num_channels,
            height,
            width,
        ) = pixel_values.shape

        pixel_values = pixel_values.reshape(
            batch_size * num_concurrent_media * num_tiles, num_channels, height, width
        )
        aspect_ratio_ids = aspect_ratio_ids.reshape(
            batch_size * num_concurrent_media, -1
        )

        # patch embedding
        patch_embeds = self.patch_embedding(pixel_values)
        hidden_state = patch_embeds.flatten(2).transpose(1, 2)

        # tile embeddings
        _, num_patches, dim = hidden_state.shape
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media, num_tiles, -1, dim
        )
        hidden_state = self.pre_tile_positional_embedding(
            hidden_state, aspect_ratio_ids
        )

        # apply cls token
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media * num_tiles, num_patches, dim
        )
        hidden_state = self.apply_class_embedding(hidden_state)
        num_patches += 1

        # apply position embeddings
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media, num_tiles, num_patches, dim
        )
        hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids)

        # apply encoder
        hidden_state = self.layernorm_pre(hidden_state)

        # Compute the number of tokens to pad
        num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8
        # Compute padding tuple for pad function
        padding = (
            0,
            0,
            0,
            num_padding_patches,
        )  # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2)
        # Pad the tensor
        hidden_state = F.pad(hidden_state, padding, mode="constant", value=0)
        slice_index = -num_padding_patches if num_padding_patches > 0 else None

        if attention_mask is not None:
            attention_mask = attention_mask.reshape(
                batch_size * num_concurrent_media, -1
            )
            attention_mask = _prepare_aspect_ratio_attention_mask(
                aspect_ratio_mask=attention_mask,
                num_patches=self.num_patches,
                target_length=hidden_state.shape[2],
                dtype=self.dtype,
            )

        hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim)
        hidden_state, all_intermediate_hidden_states = self.transformer(
            hidden_state,
            attention_mask=attention_mask,
        )
        intermediate_hidden_states = [
            hidden_state
            for idx, hidden_state in enumerate(all_intermediate_hidden_states)
            if idx in self.intermediate_layers_indices
        ]
        intermediate_hidden_states = torch.stack(intermediate_hidden_states, dim=-1)

        # apply global encoder
        hidden_state = self.layernorm_post(hidden_state)
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media,
            num_tiles,
            num_patches + num_padding_patches,
            dim,
        )
        hidden_state = self.post_tile_positional_embedding(
            hidden_state, aspect_ratio_ids
        )
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media,
            num_tiles * (num_patches + num_padding_patches),
            dim,
        )
        hidden_state, _ = self.global_transformer(
            hidden_state, attention_mask=attention_mask
        )
        hidden_state = hidden_state.reshape(
            batch_size * num_concurrent_media,
            num_tiles,
            num_patches + num_padding_patches,
            dim,
        )
        hidden_state = hidden_state[:, :, :slice_index]

        # adding intermediate layer outputs
        hidden_state = hidden_state.reshape(
            batch_size, num_concurrent_media, num_tiles, num_patches, dim
        )
        intermediate_hidden_states = intermediate_hidden_states.reshape(
            batch_size * num_concurrent_media,
            num_tiles,
            num_patches + num_padding_patches,
            -1,
        )
        intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index]
        intermediate_hidden_states = intermediate_hidden_states.reshape(
            batch_size, num_concurrent_media, num_tiles, num_patches, -1
        )
        hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1)
        return hidden_state


class MllamaTextCrossAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, *, prefix, config, weights, layer_idx):
        super().__init__()
        self.config = config
        self.num_heads = self.config.num_attention_heads
        self.num_key_value_heads = self.config.num_key_value_heads
        self.dropout = config.dropout
        self.hidden_size = config.hidden_size
        self.head_size = config.hidden_size // self.num_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.layer_idx = layer_idx

        self.num_heads = self.num_heads // weights.process_group.size()
        self.num_key_value_heads = (
            self.num_key_value_heads // weights.process_group.size()
        )

        self.q_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.q_proj",
            weights=weights,
            bias=False,
        )
        self.k_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.k_proj",
            weights=weights,
            bias=False,
        )
        self.v_proj = TensorParallelColumnLinear.load(
            config,
            prefix=f"{prefix}.v_proj",
            weights=weights,
            bias=False,
        )
        self.o_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.o_proj",
            weights=weights,
            bias=False,
        )

        self.q_norm = MllamaTextRMSNorm.load(
            prefix=f"{prefix}.q_norm", weights=weights, eps=config.rms_norm_eps
        )
        self.k_norm = MllamaTextRMSNorm.load(
            prefix=f"{prefix}.k_norm", weights=weights, eps=config.rms_norm_eps
        )
        self.softmax_scale = self.head_size**-0.5

    def forward(
        self,
        hidden_states: torch.Tensor,
        cross_attention_states: Optional[torch.Tensor] = None,
        # past_key_value=None,
        # attention_mask: Optional[torch.Tensor] = None,
        # cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        # hidden_states = hidden_states.unsqueeze(0)
        # bsz, q_len, _ = hidden_states.size()
        query_states = self.q_proj(hidden_states)
        query_states = query_states.view(-1, self.num_heads, self.head_size)
        query_states = self.q_norm(query_states)

        (
            cross_attention_states,
            cu_seqlen_q,
            cu_seqlen_k,
            max_q,
            max_k,
            indices,
        ) = cross_attention_states

        key_states = self.k_proj(cross_attention_states)
        value_states = self.v_proj(cross_attention_states)
        key_states = key_states.view(-1, self.num_key_value_heads, self.head_size)
        value_states = value_states.view(-1, self.num_key_value_heads, self.head_size)
        key_states = self.k_norm(key_states)

        # key_states = key_states.repeat(1, self.num_key_value_groups, 1)
        # value_states = value_states.repeat(1, self.num_key_value_groups, 1)

        causal = False
        # logger.info(
        #     f"Q: {query_states.shape} -K {key_states.shape} - V{value_states.shape}"
        # )
        if SYSTEM == "ipex":
            attn_output = torch.empty_like(query_states)
            if query_states.device.type == "xpu":
                ipex.llm.functional.varlen_attention(
                    query_states.contiguous(),
                    key_states.contiguous(),
                    value_states.contiguous(),
                    attn_output,
                    cu_seqlen_q,
                    cu_seqlen_k,
                    None,
                    max_q,
                    max_k,
                    0.0,
                    self.softmax_scale,
                    False,
                    causal,
                    False,
                    None,
                )
            else:
                ipex.llm.functional.varlen_attention(
                    query_states,
                    key_states,
                    value_states,
                    attn_output,
                    cu_seqlen_q,
                    cu_seqlen_k,
                    max_q,
                    max_k,
                    0.0,
                    self.softmax_scale,
                    False,
                    causal,
                    False,
                    None,
                )
        else:
            attn_output = flash_attn_2_cuda.varlen_fwd(
                query_states,
                key_states,
                value_states,
                None,
                cu_seqlen_q,
                cu_seqlen_k,
                None,
                None,
                None,  # block_tables
                None,
                max_q,
                max_k,
                0.0,
                self.softmax_scale,
                False,
                causal,  # Causal
                -1,  # window_size_left,
                -1,
                0.0,  # softcap
                False,
                None,
            )[0]
        attn_output = self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))

        return attn_output


# Copied from transformers.models.gemma2.modeling_gemma2.Gemma2MLP with Gemma2->MllamaText
class MllamaTextMLP(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )
        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
            weights=weights,
            dim=0,
            bias=False,
        )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=False,
        )
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        shape = x.shape
        gate_up_states = self.gate_up_proj(x)
        gate_up_states = gate_up_states.view(*shape[:-1], 2, self.intermediate_size)
        result = self.down_proj(
            self.act_fn(gate_up_states[:, 0]) * gate_up_states[:, 1]
        )
        return result


class FlashLlamaCrossLayer(torch.nn.Module):
    """Cross-attention transformer block with tanh-gated attention and feedforward."""

    def __init__(self, *, prefix, config, weights, index) -> None:
        layer_idx = index
        super().__init__()
        self.cross_attn = MllamaTextCrossAttention(
            prefix=f"{prefix}.cross_attn",
            config=config,
            weights=weights,
            layer_idx=layer_idx,
        )

        self.input_layernorm = MllamaTextRMSNorm.load(
            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
        )
        self.cross_attn_attn_gate = torch.nn.Parameter(
            weights.get_tensor(f"{prefix}.cross_attn_attn_gate"), requires_grad=False
        )

        self.mlp = MllamaTextMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.post_attention_layernorm = MllamaTextRMSNorm.load(
            prefix=f"{prefix}.post_attention_layernorm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.cross_attn_mlp_gate = torch.nn.Parameter(
            weights.get_tensor(f"{prefix}.cross_attn_mlp_gate"), requires_grad=False
        )
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states,
        residual,
        cos,
        sin,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        seqlen,
        max_s,
        adapter_data,
        cross_attention_states,  # [ IB, ...]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if cross_attention_states is None:
            return hidden_states, residual
        if residual is not None:
            hidden_states += residual

        indices = cross_attention_states[-1]
        out_hidden_states = hidden_states[:]
        if len(indices) > 0:
            assert max(indices) < hidden_states.shape[0]
        hidden_states = hidden_states[indices]
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        hidden_states = self.cross_attn(
            hidden_states=hidden_states,
            # attention_mask=cross_attention_mask,
            cross_attention_states=cross_attention_states,
        )
        hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states

        out_hidden_states[indices] = hidden_states
        hidden_states = out_hidden_states

        return hidden_states, None


# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->MllamaText
class MllamaTextRMSNorm(nn.Module):
    def __init__(self, weight, eps):
        super().__init__()
        self.weight = weight
        self.variance_epsilon = eps

    @classmethod
    def load(cls, *, prefix, weights, eps):
        weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.weight"), requires_grad=False
        )
        return cls(weight=weight, eps=eps)

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class MllamaForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator
        config.text_config._attn_implementation = "sdpa"
        self.hidden_size = config.text_config.hidden_size
        self.vision_model = MllamaVisionModel(
            prefix="vision_model", config=config.vision_config, weights=weights
        )
        self.multi_modal_projector = FastLinear.load(
            prefix="multi_modal_projector", config=config, weights=weights, bias=True
        )
        self.text_model = FlashLlamaForCausalLM(
            prefix="language_model", config=config.text_config, weights=weights
        )
        self.config = config
        self.dtype = weights.dtype
        self.device = weights.device

    def vision_forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask):
        if aspect_ratio_ids is None:
            raise ValueError(
                "`aspect_ratio_ids` must be provided if `pixel_values` is provided"
            )
        # logger.info(f"PIxel values {pixel_values.shape}")
        batch_size = pixel_values.shape[0]
        vision_states = self.vision_model(
            pixel_values, aspect_ratio_ids, aspect_ratio_mask
        )
        cross_attention_states = self.multi_modal_projector(vision_states).reshape(
            -1, vision_states.shape[-2], self.hidden_size
        )
        _, _, h = cross_attention_states.shape
        cross_attention_states = cross_attention_states.view(batch_size, -1, h)
        # logger.info(f"cross {cross_attention_states.shape}")
        return cross_attention_states

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor],
        adapter_data: Optional[torch.Tensor] = None,
        # XXX: Putting these as optional so that the cuda warmup calls can go through.
        cross_attention_states: Optional[torch.Tensor] = None,
        image_indices=None,
    ):
        if cross_attention_states is not None:
            seqlen_q = len(image_indices)
            n_images = cross_attention_states.shape[0]
            seqlen_k = cross_attention_states.shape[1]
            device = cross_attention_states.device
            if cu_seqlen_prefill is not None:
                offset = 0
                cu_q = []
                indices = []
                for index in image_indices:
                    cu_q.append(offset)
                    length = seqlen.input_lengths[index].item()
                    assert index < seqlen.cu_seqlen_q.shape[0]
                    input_ids_offset = seqlen.cu_seqlen_q[index]
                    indices.extend(range(input_ids_offset, input_ids_offset + length))
                    offset += length
                cu_q.append(offset)
                cu_seqlen_q = torch.Tensor(cu_q).to(device=device, dtype=torch.int32)

                assert max(indices) < input_ids.shape[0]

                cu_seqlen_k = (
                    torch.arange(
                        n_images + 1,
                        device=device,
                        dtype=torch.int32,
                    )
                    * seqlen_k
                )
                max_q = cu_seqlen_q[-1].item()
                max_k = seqlen_k
            else:
                cu_seqlen_q = torch.arange(
                    seqlen_q + 1, device=device, dtype=torch.int32
                )
                seqlen_k = cross_attention_states.shape[1]
                n_images = cross_attention_states.shape[0]
                cu_seqlen_k = (
                    torch.arange(
                        n_images + 1,
                        device=device,
                        dtype=torch.int32,
                    )
                    * seqlen_k
                )
                max_q = seqlen_q
                max_k = seqlen_k
                indices = image_indices[:]

            cross_attention_states = (
                cross_attention_states,
                cu_seqlen_q,
                cu_seqlen_k,
                max_q,
                max_k,
                indices,
            )

        outputs = self.text_model(
            input_ids=input_ids,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
            prefill_cache_indices=prefill_cache_indices,
            lm_head_indices=lm_head_indices,
            adapter_data=adapter_data,
            cross_attention_states=cross_attention_states,
        )

        return outputs


================================================
FILE: server/text_generation_server/models/custom_modeling/mpt_modeling.py
================================================
"""A simple, flexible implementation of a GPT model.

Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
"""

import math
import warnings
from typing import List, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
)
from einops import rearrange
from packaging import version
from text_generation_server.layers import (
    TensorParallelEmbedding,
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    SpeculativeHead,
    get_linear,
)

EPS = 1e-5


def load_col(config, prefix, weights, bias):
    assert config.quantize != "gptq", NotImplementedError
    slice_ = weights._get_slice(f"{prefix}.weight")
    rank = weights.process_group.rank()
    size = weights.process_group.size()

    h3, h = slice_.get_shape()
    block_size = h // size

    q_part = slice_[rank * block_size : (rank + 1) * block_size]
    k_part = slice_[h + rank * block_size : h + (rank + 1) * block_size]
    v_part = slice_[2 * h + rank * block_size : 2 * h + (rank + 1) * block_size]

    weight = torch.cat([q_part, k_part, v_part], dim=0)
    if weight.dtype != torch.int32:
        weight = weight.to(dtype=weights.dtype)
    weight = weight.to(device=weights.device)

    if bias:
        bias_slice_ = weights._get_slice(f"{prefix}.bias")
        bias_rank = weights.process_group.rank()
        bias_size = weights.process_group.size()

        bias_h = bias_slice_.get_shape()
        bias_h = bias_h[0]
        bias_block_size = bias_h // bias_size

        bias_q_part = bias_slice_[
            bias_rank * bias_block_size : (bias_rank + 1) * bias_block_size
        ]
        bias_k_part = bias_slice_[
            bias_h
            + bias_rank * bias_block_size : bias_h
            + (bias_rank + 1) * bias_block_size
        ]
        bias_v_part = bias_slice_[
            2 * bias_h
            + bias_rank * bias_block_size : 2 * bias_h
            + (bias_rank + 1) * bias_block_size
        ]

        bias = torch.cat([bias_q_part, bias_k_part, bias_v_part], dim=0)
        if bias.dtype != torch.int32:
            bias = bias.to(dtype=weights.dtype)
        bias = bias.to(device=weights.device)
    else:
        bias = None
    linear = get_linear(weight, bias)
    return TensorParallelColumnLinear(linear)


def _reset_is_causal(
    num_query_tokens: int, num_key_tokens: int, original_is_causal: bool
):
    if original_is_causal and num_query_tokens != num_key_tokens:
        if num_query_tokens != 1:
            raise NotImplementedError(
                "MPT does not support query and key with different number of tokens, unless number of query tokens is 1."
            )
        else:
            return False
    return original_is_causal


def scaled_multihead_dot_product_attention(
    query,
    key,
    value,
    n_heads,
    past_key_value=None,
    softmax_scale=None,
    attn_bias=None,
    key_padding_mask=None,
    is_causal=False,
    dropout_p=0.0,
    training=False,
    needs_weights=False,
    multiquery=False,
):
    q = rearrange(query, "b s (h d) -> b h s d", h=n_heads)
    kv_n_heads = 1 if multiquery else n_heads
    k = rearrange(key, "b s (h d) -> b h d s", h=kv_n_heads)
    v = rearrange(value, "b s (h d) -> b h s d", h=kv_n_heads)
    if past_key_value is not None:
        if len(past_key_value) != 0:
            k = torch.cat([past_key_value[0], k], dim=3)
            v = torch.cat([past_key_value[1], v], dim=2)
        past_key_value = (k, v)
    (b, _, s_q, d) = q.shape
    s_k = k.size(-1)
    attn_weight = q.matmul(k) * softmax_scale
    if attn_bias is not None:
        _s_q = max(0, attn_bias.size(2) - s_q)
        _s_k = max(0, attn_bias.size(3) - s_k)
        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
        if (
            attn_bias.size(-1) != 1
            and attn_bias.size(-1) != s_k
            or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q)
        ):
            raise RuntimeError(
                f"attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}."
            )
        attn_weight = attn_weight + attn_bias
    min_val = torch.finfo(q.dtype).min
    if key_padding_mask is not None:
        if attn_bias is not None:
            warnings.warn(
                "Propogating key_padding_mask to the attention module "
                + "and applying it within the attention module can cause "
                + "unneccessary computation/memory usage. Consider integrating "
                + "into attn_bias once and passing that to each attention "
                + "module instead."
            )
        attn_weight = attn_weight.masked_fill(
            ~key_padding_mask.view((b, 1, 1, s_k)), min_val
        )
    if is_causal and (not q.size(2) == 1):
        s = max(s_q, s_k)
        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
        causal_mask = causal_mask.tril()
        causal_mask = causal_mask.to(torch.bool)
        causal_mask = ~causal_mask
        causal_mask = causal_mask[-s_q:, -s_k:]
        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
    attn_weight = torch.softmax(attn_weight, dim=-1)
    if dropout_p:
        attn_weight = torch.nn.functional.dropout(
            attn_weight, p=dropout_p, training=training, inplace=True
        )
    out = attn_weight.to(v.dtype).matmul(v)
    out = rearrange(out, "b h s d -> b s (h d)")
    if needs_weights:
        return (out, attn_weight, past_key_value)
    return (out, None, past_key_value)


def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
    for tensor in tensors:
        if tensor.dtype not in valid_dtypes:
            raise TypeError(
                f"tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}."
            )
        if not tensor.is_cuda:
            raise TypeError(
                f"Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r})."
            )


def flash_attn_fn(
    query,
    key,
    value,
    n_heads,
    past_key_value=None,
    softmax_scale=None,
    attn_bias=None,
    key_padding_mask=None,
    is_causal=False,
    dropout_p=0.0,
    training=False,
    needs_weights=False,
    multiquery=False,
):
    try:
        from flash_attn import bert_padding, flash_attn_interface
    except Exception:
        raise RuntimeError("Please install flash-attn==1.0.3.post0")
    check_valid_inputs(query, key, value)
    if past_key_value is not None:
        if len(past_key_value) != 0:
            key = torch.cat([past_key_value[0], key], dim=1)
            value = torch.cat([past_key_value[1], value], dim=1)
        past_key_value = (key, value)
    if attn_bias is not None:
        _s_q = max(0, attn_bias.size(2) - query.size(1))
        _s_k = max(0, attn_bias.size(3) - key.size(1))
        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
    if attn_bias is not None:
        raise NotImplementedError("attn_bias not implemented for flash attn.")
    (batch_size, seqlen) = query.shape[:2]
    if key_padding_mask is None:
        key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
    query_padding_mask = key_padding_mask[:, -query.size(1) :]
    (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(
        query, query_padding_mask
    )
    query_unpad = rearrange(query_unpad, "nnz (h d) -> nnz h d", h=n_heads)
    (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(
        key, key_padding_mask
    )
    key_unpad = rearrange(
        key_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads
    )
    (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
    value_unpad = rearrange(
        value_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads
    )
    if multiquery:
        key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
        value_unpad = value_unpad.expand(
            value_unpad.size(0), n_heads, value_unpad.size(-1)
        )
    dropout_p = dropout_p if training else 0.0
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    output_unpad = flash_attn_interface.flash_attn_unpadded_func(
        query_unpad,
        key_unpad,
        value_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        dropout_p,
        softmax_scale=softmax_scale,
        causal=reset_is_causal,
        return_attn_probs=needs_weights,
    )
    output = bert_padding.pad_input(
        rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices_q, batch_size, seqlen
    )
    return (output, None, past_key_value)


def triton_flash_attn_fn(
    query,
    key,
    value,
    n_heads,
    past_key_value=None,
    softmax_scale=None,
    attn_bias=None,
    key_padding_mask=None,
    is_causal=False,
    dropout_p=0.0,
    training=False,
    needs_weights=False,
    multiquery=False,
):
    try:
        from .flash_attn_triton import flash_attn_func
    except Exception:
        _installed = False
        if version.parse(torch.__version__) < version.parse("2.0.0"):
            _installed = True
            try:
                from flash_attn.flash_attn_triton import flash_attn_func
            except Exception:
                _installed = False
        if not _installed:
            raise RuntimeError(
                "Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed."
            )
    check_valid_inputs(query, key, value)
    if past_key_value is not None:
        if len(past_key_value) != 0:
            key = torch.cat([past_key_value[0], key], dim=1)
            value = torch.cat([past_key_value[1], value], dim=1)
        past_key_value = (key, value)
    if attn_bias is not None:
        _s_q = max(0, attn_bias.size(2) - query.size(1))
        _s_k = max(0, attn_bias.size(3) - key.size(1))
        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
    if dropout_p:
        raise NotImplementedError("Dropout not implemented for attn_impl: triton.")
    if needs_weights:
        raise NotImplementedError("attn_impl: triton cannot return attn weights.")
    if key_padding_mask is not None:
        warnings.warn(
            "Propagating key_padding_mask to the attention module "
            + "and applying it within the attention module can cause "
            + "unnecessary computation/memory usage. Consider integrating "
            + "into attn_bias once and passing that to each attention "
            + "module instead."
        )
        (b_size, s_k) = key_padding_mask.shape[:2]
        if attn_bias is None:
            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
        attn_bias = attn_bias.masked_fill(
            ~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min
        )
    query = rearrange(query, "b s (h d) -> b s h d", h=n_heads)
    key = rearrange(key, "b s (h d) -> b s h d", h=1 if multiquery else n_heads)
    value = rearrange(value, "b s (h d) -> b s h d", h=1 if multiquery else n_heads)
    if multiquery:
        key = key.expand(*key.shape[:2], n_heads, key.size(-1))
        value = value.expand(*value.shape[:2], n_heads, value.size(-1))
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    attn_output = flash_attn_func(
        query, key, value, attn_bias, reset_is_causal, softmax_scale
    )
    output = attn_output.view(*attn_output.shape[:2], -1)
    return (output, None, past_key_value)


class MultiheadAttention(nn.Module):
    """Multi-head self attention.

    Using torch or triton attention implementation enables user to also use
    additive bias.
    """

    def __init__(
        self,
        config,
        prefix,
        weights,
    ):
        super().__init__()
        attn_impl = config.attn_config.attn_impl
        self.attn_impl = config.attn_config.attn_impl
        self.clip_qkv = config.attn_config.clip_qkv
        self.qk_ln = config.attn_config.qk_ln
        self.d_model = config.d_model
        d_model = config.d_model
        self.n_heads = config.n_heads
        self.softmax_scale = config.attn_config.softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
        self.attn_dropout_p = config.attn_config.attn_pdrop

        if self.n_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`n_heads` must be divisible by `num_shards` (got `n_heads`: {self.n_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.n_heads = self.n_heads // weights.process_group.size()
        self.Wqkv = load_col(
            config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias
        )
        if self.qk_ln:
            bias = not config.no_bias
            hidden_size = config.d_model
            head_dim = hidden_size // self.n_heads

            self.q_ln = LPLayerNorm(
                d_model, bias=bias, prefix=f"{prefix}.q_ln", weights=weights
            )
            self.k_ln = LPLayerNorm(
                self.n_heads * head_dim, prefix=f"{prefix}.k_ln", weights=weights
            )
        if self.attn_impl == "flash":
            self.attn_fn = flash_attn_fn
        elif self.attn_impl == "triton":
            self.attn_fn = triton_flash_attn_fn
        elif self.attn_impl == "torch":
            self.attn_fn = scaled_multihead_dot_product_attention
        else:
            raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
        self.out_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.out_proj",
            weights=weights,
            bias=not config.no_bias,
        )

    def forward(
        self,
        x,
        past_key_value=None,
        attn_bias=None,
        attention_mask=None,
        is_causal=True,
        needs_weights=False,
    ):
        qkv = self.Wqkv(x)
        if self.clip_qkv:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        (query, key, value) = qkv.chunk(3, dim=2)

        key_padding_mask = attention_mask
        if self.qk_ln:
            dtype = query.dtype
            query = self.q_ln(query).to(dtype)
            key = self.k_ln(key).to(dtype)
        (context, attn_weights, past_key_value) = self.attn_fn(
            query,
            key,
            value,
            self.n_heads,
            past_key_value=past_key_value,
            softmax_scale=self.softmax_scale,
            attn_bias=attn_bias,
            key_padding_mask=key_padding_mask,
            is_causal=is_causal,
            dropout_p=self.attn_dropout_p,
            training=self.training,
            needs_weights=needs_weights,
        )
        out = self.out_proj(context)
        return (out, attn_weights, past_key_value)


class MultiQueryAttention(nn.Module):
    """Multi-Query self attention.

    Using torch or triton attention implementation enables user to also use
    additive bias.
    """

    def __init__(self, config, prefix, weights, verbose=False):
        super().__init__()
        attn_impl = config.attn_config.attn_impl
        self.attn_impl = config.attn_config.attn_impl
        self.clip_qkv = config.attn_config.clip_qkv
        self.qk_ln = config.attn_config.qk_ln
        self.d_model = config.d_model
        d_model = config.d_model
        self.n_heads = config.n_heads
        self.softmax_scale = config.attn_config.softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.head_dim)
        self.attn_dropout_p = config.attn_config.attn_pdrop
        # self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
        self.Wqkv = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias
        )
        (d_model, d_model + self.head_dim)
        if self.qk_ln:
            raise NotImplementedError("qk_ln not supported")
        if self.attn_impl == "flash":
            self.attn_fn = flash_attn_fn
        elif self.attn_impl == "triton":
            self.attn_fn = triton_flash_attn_fn
            if verbose:
                warnings.warn(
                    "While `attn_impl: triton` can be faster than `attn_impl: flash` "
                    + "it uses more memory. When training larger models this can trigger "
                    + "alloc retries which hurts performance. If encountered, we recommend "
                    + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`."
                )
        elif self.attn_impl == "torch":
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available() and verbose:
                warnings.warn(
                    "Using `attn_impl: torch`. If your model does not use `alibi` or "
                    + "`prefix_lm` we recommend using `attn_impl: flash` otherwise "
                    + "we recommend using `attn_impl: triton`."
                )
        else:
            raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
        self.out_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.out_proj",
            weights=weights,
            bias=not config.no_bias,
        )
        # self.out_proj._is_residual = True

    def forward(
        self,
        x,
        past_key_value=None,
        attn_bias=None,
        attention_mask=None,
        is_causal=True,
        needs_weights=False,
    ):
        qkv = self.Wqkv(x)
        if self.clip_qkv:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        (query, key, value) = qkv.split(
            [self.d_model, self.head_dim, self.head_dim], dim=2
        )
        key_padding_mask = attention_mask
        if self.qk_ln:
            dtype = query.dtype
            query = self.q_ln(query).to(dtype)
            key = self.k_ln(key).to(dtype)
        (context, attn_weights, past_key_value) = self.attn_fn(
            query,
            key,
            value,
            self.n_heads,
            past_key_value=past_key_value,
            softmax_scale=self.softmax_scale,
            attn_bias=attn_bias,
            key_padding_mask=key_padding_mask,
            is_causal=is_causal,
            dropout_p=self.attn_dropout_p,
            training=self.training,
            needs_weights=needs_weights,
            multiquery=True,
        )
        return (self.out_proj(context), attn_weights, past_key_value)


def attn_bias_shape(
    attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id
):
    if attn_impl == "flash":
        return None
    elif attn_impl in ["torch", "triton"]:
        if alibi:
            if (prefix_lm or not causal) or use_sequence_id:
                return (1, n_heads, seq_len, seq_len)
            return (1, n_heads, 1, seq_len)
        elif prefix_lm or use_sequence_id:
            return (1, 1, seq_len, seq_len)
        return None
    else:
        raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")


def build_attn_bias(
    attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8
):
    if attn_impl == "flash":
        return None
    elif attn_impl in ["torch", "triton"]:
        if alibi:
            (device, dtype) = (attn_bias.device, attn_bias.dtype)
            attn_bias = attn_bias.add(
                build_alibi_bias(
                    n_heads,
                    seq_len,
                    full=not causal,
                    alibi_bias_max=alibi_bias_max,
                    device=device,
                    dtype=dtype,
                )
            )
        return attn_bias
    else:
        raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")


def gen_slopes(n_heads, alibi_bias_max=8, device=None):
    _n_heads = 2 ** math.ceil(math.log2(n_heads))
    m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
    m = m.mul(alibi_bias_max / _n_heads)
    slopes = 1.0 / torch.pow(2, m)
    if _n_heads != n_heads:
        slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
    return slopes.view(1, n_heads, 1, 1)


def build_alibi_bias(
    n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None
):
    alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(
        1, 1, 1, seq_len
    )
    if full:
        alibi_bias = alibi_bias - torch.arange(
            1 - seq_len, 1, dtype=torch.int32, device=device
        ).view(1, 1, seq_len, 1)
        alibi_bias = alibi_bias.abs().mul(-1)
    slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
    alibi_bias = alibi_bias * slopes
    return alibi_bias.to(dtype=dtype)


ATTN_CLASS_REGISTRY = {
    "multihead_attention": MultiheadAttention,
    "multiquery_attention": MultiQueryAttention,
}

"""GPT Blocks used for the GPT Model."""


class MPTMLP(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        # self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
        self.up_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.up_proj", weights=weights, bias=not config.no_bias
        )
        self.act = nn.GELU(approximate="none")
        # self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
            bias=not config.no_bias,
        )
        # self.down_proj._is_residual = True

    def forward(self, x):
        return self.down_proj(self.act(self.up_proj(x)))


class MPTBlock(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.prefix = prefix
        if config.attn_config.attn_type != "multihead_attention":
            raise NotImplementedError(
                f"""Not implemented attn {config.attn_config.attn_type}"""
            )
        resid_pdrop = config.resid_pdrop
        if config.no_bias:
            self.norm_1 = nn.LayerNorm.load_no_bias(
                prefix=f"{prefix}.norm_1", weights=weights, eps=EPS
            )
            self.norm_2 = nn.LayerNorm.load_no_bias(
                prefix=f"{prefix}.norm_2", weights=weights, eps=EPS
            )
        else:
            self.norm_1 = nn.LayerNorm.load(
                prefix=f"{prefix}.norm_1", weights=weights, eps=EPS
            )
            self.norm_2 = nn.LayerNorm.load(
                prefix=f"{prefix}.norm_2", weights=weights, eps=EPS
            )
        self.attn = MultiheadAttention(config, prefix=f"{prefix}.attn", weights=weights)
        self.ffn = MPTMLP(config, prefix=f"{prefix}.ffn", weights=weights)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

    def forward(
        self,
        x: torch.Tensor,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attn_bias: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.ByteTensor] = None,
        is_causal: bool = True,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        a = self.norm_1(x)
        (b, attn_weights, past_key_value) = self.attn(
            a,
            past_key_value=past_key_value,
            attn_bias=attn_bias,
            attention_mask=attention_mask,
            is_causal=is_causal,
        )
        x = x + self.resid_attn_dropout(b)
        m = self.norm_2(x)
        n = self.ffn(m)
        x = x + self.resid_ffn_dropout(n)
        return (x, attn_weights, past_key_value)


def _cast_if_autocast_enabled(tensor):
    if torch.is_autocast_enabled():
        if tensor.device.type == "cuda":
            dtype = torch.get_autocast_gpu_dtype()
        elif tensor.device.type == "cpu":
            dtype = torch.get_autocast_cpu_dtype()
        else:
            raise NotImplementedError()
        return tensor.to(dtype=dtype)
    return tensor


class LPLayerNorm(torch.nn.LayerNorm):
    def __init__(
        self,
        normalized_shape,
        eps=1e-05,
        elementwise_affine=True,
        device=None,
        dtype=None,
        bias: Optional[bool] = True,
        prefix=None,
        weights=None,
    ):
        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            elementwise_affine=elementwise_affine,
            device=device,
            dtype=dtype,
            bias=bias,
        )
        if weights is not None:
            self.weight = nn.Parameter(weights.get_sharded(f"{prefix}.weight", dim=0))
            if bias:
                self.bias = nn.Parameter(weights.get_sharded(f"{prefix}.bias", dim=0))
            self.normalized_shape = self.weight.shape

    def forward(self, x):
        module_device = x.device
        downcast_x = _cast_if_autocast_enabled(x)
        downcast_weight = (
            _cast_if_autocast_enabled(self.weight)
            if self.weight is not None
            else self.weight
        )
        downcast_bias = (
            _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
        )
        with torch.autocast(enabled=False, device_type=module_device.type):
            return torch.nn.functional.layer_norm(
                downcast_x,
                self.normalized_shape,
                downcast_weight,
                downcast_bias,
                self.eps,
            )


def rms_norm(x, weight=None, eps=1e-05):
    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    if weight is not None:
        return output * weight
    return output


class RMSNorm(torch.nn.Module):
    def __init__(
        self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None
    ):
        super().__init__()
        self.eps = eps
        if weight:
            self.weight = torch.nn.Parameter(
                torch.ones(normalized_shape, dtype=dtype, device=device)
            )
        else:
            self.register_parameter("weight", None)

    def forward(self, x):
        return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype)


class LPRMSNorm(RMSNorm):
    def __init__(
        self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None
    ):
        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            weight=weight,
            dtype=dtype,
            device=device,
        )

    def forward(self, x):
        downcast_x = _cast_if_autocast_enabled(x)
        downcast_weight = (
            _cast_if_autocast_enabled(self.weight)
            if self.weight is not None
            else self.weight
        )
        with torch.autocast(enabled=False, device_type=x.device.type):
            return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)


NORM_CLASS_REGISTRY = {
    "layernorm": torch.nn.LayerNorm,
    "low_precision_layernorm": LPLayerNorm,
    "rmsnorm": RMSNorm,
    "low_precision_rmsnorm": LPRMSNorm,
}

Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]


class MPTPreTrainedModel(PreTrainedModel):
    base_model_prefix = "model"
    _no_split_modules = ["MPTBlock"]


class MPTModel(MPTPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        # config._validate_config()
        super().__init__(config)
        self.world_size = weights.process_group.size()
        self.rank = weights.process_group.rank()
        self.n_heads = config.n_heads
        self.attn_impl = config.attn_config.attn_impl
        self.prefix_lm = config.attn_config.prefix_lm
        self.attn_uses_sequence_id = config.attn_config.attn_uses_sequence_id
        self.alibi = config.attn_config.alibi
        self.alibi_bias_max = config.attn_config.alibi_bias_max
        if config.init_device == "mixed":
            # TODO: reimplement mixed device initialization
            # dist.get_local_rank() == 0:
            if True:
                config.init_device = "cpu"
            else:
                config.init_device = "meta"
        if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
            norm_options = " | ".join(NORM_CLASS_REGISTRY.keys())
            raise NotImplementedError(
                f"Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options})."
            )
        if config.norm_type.lower() != "low_precision_layernorm":
            raise NotImplementedError(
                f"Requested norm type ({config.norm_type}) is not implemented within this repo."
            )

        self.wte = TensorParallelEmbedding(f"{prefix}.wte", weights)

        if not self.alibi:
            self.wpe = TensorParallelEmbedding(f"{prefix}.wpe", weights)
        self.blocks = nn.ModuleList(
            [
                MPTBlock(config, prefix=f"{prefix}.blocks.{i}", weights=weights)
                for i in range(config.n_layers)
            ]
        )
        if config.no_bias:
            self.norm_f = nn.LayerNorm.load_no_bias(
                prefix="transformer.norm_f", weights=weights, eps=EPS
            )
        else:
            self.norm_f = nn.LayerNorm.load(
                prefix="transformer.norm_f", weights=weights, eps=EPS
            )
        self.is_causal = not self.prefix_lm
        self._attn_bias_initialized = False
        self.attn_bias = None
        self.attn_bias_shape = attn_bias_shape(
            self.attn_impl,
            config.n_heads,
            config.max_seq_len,
            self.alibi,
            prefix_lm=self.prefix_lm,
            causal=self.is_causal,
            use_sequence_id=self.attn_uses_sequence_id,
        )
        if config.no_bias:
            for module in self.modules():
                if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
                    if config.verbose:
                        warnings.warn(f"Removing bias ({module.bias}) from {module}.")
                    module.register_parameter("bias", None)
        if hasattr(self.config, "verbose"):
            if config.verbose and config.verbose > 2:
                print(self)
        if "verbose" not in self.config.init_config:
            self.config.init_config["verbose"] = self.config.verbose
        if self.config.init_config["verbose"] > 1:
            init_fn_name = self.config.init_config["name"]
            warnings.warn(f"Using {init_fn_name} initialization.")

    @torch.no_grad()
    def _attn_bias(
        self,
        device,
        dtype,
        attention_mask: Optional[torch.ByteTensor] = None,
        prefix_mask: Optional[torch.ByteTensor] = None,
        sequence_id: Optional[torch.LongTensor] = None,
    ):
        if not self._attn_bias_initialized:
            if self.attn_bias_shape:
                self.attn_bias = torch.zeros(
                    self.attn_bias_shape, device=device, dtype=dtype
                )
                self.attn_bias = build_attn_bias(
                    self.attn_impl,
                    self.attn_bias,
                    self.config.n_heads,
                    self.config.max_seq_len,
                    causal=self.is_causal,
                    alibi=self.alibi,
                    alibi_bias_max=self.alibi_bias_max,
                )
                assert self.n_heads % self.world_size == 0
                block_size = self.n_heads // self.world_size
                self.attn_bias = self.attn_bias[
                    :, self.rank * block_size : (self.rank + 1) * block_size
                ]
            self._attn_bias_initialized = True
        if self.attn_impl == "flash":
            return (self.attn_bias, attention_mask)
        if self.attn_bias is not None:
            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
        attn_bias = self.attn_bias
        if self.prefix_lm:
            assert isinstance(attn_bias, torch.Tensor)
            assert isinstance(prefix_mask, torch.Tensor)
            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
        if self.attn_uses_sequence_id and sequence_id is not None:
            assert isinstance(attn_bias, torch.Tensor)
            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
        if attention_mask is not None:
            s_k = attention_mask.shape[-1]
            if attn_bias is None:
                attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
            else:
                _s_k = max(0, attn_bias.size(-1) - s_k)
                attn_bias = attn_bias[:, :, :, _s_k:]
            if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
                raise ValueError(
                    f"attention_mask shape={attention_mask.shape} "
                    + f"and prefix_mask shape={prefix_mask.shape} are not equal."
                )
            min_val = torch.finfo(attn_bias.dtype).min
            attn_bias = attn_bias.masked_fill(
                ~attention_mask.view(-1, 1, 1, s_k), min_val
            )
        return (attn_bias, None)

    def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
        (s_k, s_q) = attn_bias.shape[-2:]
        if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
            raise ValueError(
                "attn_bias does not match the expected shape. "
                + f"The last two dimensions should both be {self.config.max_length} "
                + f"but are {s_k} and {s_q}."
            )
        seq_len = prefix_mask.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(
                f"prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}"
            )
        attn_bias = attn_bias[..., :seq_len, :seq_len]
        causal = torch.tril(
            torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)
        ).view(1, 1, seq_len, seq_len)
        prefix = prefix_mask.view(-1, 1, 1, seq_len)
        cannot_attend = ~torch.logical_or(causal, prefix.bool())
        min_val = torch.finfo(attn_bias.dtype).min
        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
        return attn_bias

    def _apply_sequence_id(
        self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor
    ):
        seq_len = sequence_id.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(
                f"sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}"
            )
        attn_bias = attn_bias[..., :seq_len, :seq_len]
        cannot_attend = torch.logical_not(
            torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))
        ).unsqueeze(1)
        min_val = torch.finfo(attn_bias.dtype).min
        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
        return attn_bias

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.ByteTensor] = None,
        prefix_mask: Optional[torch.ByteTensor] = None,
        sequence_id: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        use_cache: Optional[bool] = None,
    ):
        return_dict = (
            return_dict if return_dict is not None else self.config.return_dict
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if attention_mask is not None:
            attention_mask = attention_mask.bool()
        if prefix_mask is not None:
            prefix_mask = prefix_mask.bool()
        if not return_dict:
            raise NotImplementedError(
                "return_dict False is not implemented yet for MPT"
            )
        if output_attentions:
            if self.attn_impl != "torch":
                raise NotImplementedError(
                    "output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`."
                )
        if (
            attention_mask is not None
            and attention_mask[:, 0].sum() != attention_mask.shape[0]
            and self.training
        ):
            raise NotImplementedError(
                "MPT does not support training with left padding."
            )
        if self.prefix_lm and prefix_mask is None:
            raise ValueError(
                "prefix_mask is a required argument when MPT is configured with prefix_lm=True."
            )
        if self.training:
            if self.attn_uses_sequence_id and sequence_id is None:
                raise ValueError(
                    "sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True "
                    + "and the model is in train mode."
                )
            elif self.attn_uses_sequence_id is False and sequence_id is not None:
                warnings.warn(
                    "MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. "
                    + "This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True."
                )
        S = input_ids.size(1)
        assert (
            S <= self.config.max_seq_len
        ), f"Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}"
        tok_emb = self.wte(input_ids)
        if self.alibi:
            x = tok_emb
        else:
            past_position = 0
            if past_key_values is not None:
                if len(past_key_values) != self.config.n_layers:
                    raise ValueError(
                        "past_key_values must provide a past_key_value for each attention "
                        + f"layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r})."
                    )
                past_position = past_key_values[0][0].size(1)
                if self.attn_impl == "torch":
                    past_position = past_key_values[0][0].size(3)
            if S + past_position > self.config.max_seq_len:
                raise ValueError(
                    f"Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}."
                )
            pos = torch.arange(
                past_position,
                S + past_position,
                dtype=torch.long,
                device=input_ids.device,
            ).unsqueeze(0)
            if attention_mask is not None:
                pos = torch.clamp(
                    pos
                    - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[
                        :, past_position:
                    ],
                    min=0,
                )
            pos_emb = self.wpe(pos)
            x = tok_emb + pos_emb
        (attn_bias, attention_mask) = self._attn_bias(
            device=x.device,
            dtype=torch.float32,
            attention_mask=attention_mask,
            prefix_mask=prefix_mask,
            sequence_id=sequence_id,
        )
        if use_cache and past_key_values is None:
            past_key_values = [() for _ in range(self.config.n_layers)]
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        for b_idx, block in enumerate(self.blocks):
            if output_hidden_states:
                assert all_hidden_states is not None
                all_hidden_states = all_hidden_states + (x,)
            past_key_value = (
                past_key_values[b_idx] if past_key_values is not None else None
            )
            (x, attn_weights, past_key_value) = block(
                x,
                past_key_value=past_key_value,
                attn_bias=attn_bias,
                attention_mask=attention_mask,
                is_causal=self.is_causal,
            )
            if past_key_values is not None:
                past_key_values[b_idx] = past_key_value
            if output_attentions:
                assert all_self_attns is not None
                all_self_attns = all_self_attns + (attn_weights,)
        x = self.norm_f(x)
        if output_hidden_states:
            assert all_hidden_states is not None
            all_hidden_states = all_hidden_states + (x,)
        return BaseModelOutputWithPast(
            last_hidden_state=x,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class MPTForCausalLM(MPTPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)

        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"

        if not config.tie_word_embeddings:
            raise ValueError("MPTForCausalLM only supports tied word embeddings")
        self.transformer = MPTModel(prefix, config, weights)
        self.lm_head = SpeculativeHead.load(
            config, prefix=f"{prefix}.wte", weights=weights
        )
        self.logit_scale = None
        if config.logit_scale is not None:
            logit_scale = config.logit_scale
            if isinstance(logit_scale, str):
                if logit_scale == "inv_sqrt_d_model":
                    logit_scale = 1 / math.sqrt(config.d_model)
                else:
                    raise ValueError(
                        f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
                    )
            self.logit_scale = logit_scale

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.ByteTensor] = None,
        prefix_mask: Optional[torch.ByteTensor] = None,
        sequence_id: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        use_cache: Optional[bool] = None,
    ):
        return_dict = (
            return_dict if return_dict is not None else self.config.return_dict
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        outputs = self.transformer(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            prefix_mask=prefix_mask,
            sequence_id=sequence_id,
            return_dict=return_dict,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
        )
        logits, speculative_logits = self.lm_head(outputs.last_hidden_state)
        if self.logit_scale is not None:
            if self.logit_scale == 0:
                warnings.warn(
                    f"Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs."
                )
            logits *= self.logit_scale
        loss = None
        if labels is not None:
            labels = torch.roll(labels, shifts=-1)
            labels[:, -1] = -100
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1)
            )
        return (
            CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            ),
            speculative_logits,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
    ):
        if inputs_embeds is not None:
            raise NotImplementedError("inputs_embeds is not implemented for MPT yet")
        attention_mask = kwargs["attention_mask"].bool()
        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
            raise NotImplementedError(
                "MPT does not support generation with right padding."
            )
        if self.transformer.attn_uses_sequence_id and self.training:
            sequence_id = torch.zeros_like(input_ids[:1])
        else:
            sequence_id = None
        if past_key_values is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)
        if self.transformer.prefix_lm:
            prefix_mask = torch.ones_like(attention_mask)
            if kwargs.get("use_cache") is False:
                raise NotImplementedError(
                    "MPT with prefix_lm=True does not support use_cache=False."
                )
        else:
            prefix_mask = None
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "prefix_mask": prefix_mask,
            "sequence_id": sequence_id,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache", True),
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Used by HuggingFace generate when using beam search with kv-caching.

        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
        for an example in transformers.
        """
        reordered_past = []
        for layer_past in past_key_values:
            reordered_past += [
                tuple(
                    (past_state.index_select(0, beam_idx) for past_state in layer_past)
                )
            ]
        return reordered_past


================================================
FILE: server/text_generation_server/models/custom_modeling/neox_modeling.py
================================================
# coding=utf-8
# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch GPTNeoX model."""

from typing import Optional, Tuple, Union

import os
import torch
import torch.distributed
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    SpeculativeHead,
)


CUSTOM_KERNELS_ENABLED = False
if (
    torch.cuda.is_available()
    and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
):
    try:
        from custom_kernels import fused_attention_cuda

        CUSTOM_KERNELS_ENABLED = True
    except ImportError:
        pass


def make_causal_mask(
    input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int
) -> torch.BoolTensor:
    """
    Make causal mask used for self-attention.
    """
    batch_size, target_length = input_ids_shape
    mask = torch.ones(
        (target_length, target_length + past_key_values_length),
        dtype=torch.bool,
        device=device,
    )
    mask = mask.triu(1 + past_key_values_length)

    expanded_mask = mask.unsqueeze(0).expand(
        batch_size, target_length, target_length + past_key_values_length
    )
    return expanded_mask


def expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
    """
    Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.
    """
    batch_size, src_length = mask.shape
    tgt_length = tgt_length if tgt_length is not None else src_length

    expanded_mask = ~(mask[:, None, :].to(torch.bool))
    return expanded_mask.expand(batch_size, tgt_length, src_length)


def prepare_attn_mask(
    attention_mask: torch.Tensor,
    input_shape: Tuple[int, int],
    past_key_values_length: int,
) -> torch.BoolTensor:
    # create causal mask
    # [batch_size, seq_length] -> [batch_size, tgt_length, src_length]
    combined_attention_mask = None
    device = attention_mask.device
    _, src_length = input_shape

    if src_length > 1:
        combined_attention_mask = make_causal_mask(
            input_shape, device=device, past_key_values_length=past_key_values_length
        )

    # [batch_size, seq_length] -> [batch_size, tgt_length, src_length]
    expanded_attn_mask = expand_mask(attention_mask, tgt_length=src_length)
    combined_attention_mask = (
        expanded_attn_mask
        if combined_attention_mask is None
        else expanded_attn_mask | combined_attention_mask
    )

    return combined_attention_mask


class GPTNeoXPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """


class GPTNeoXAttention(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_attention_heads
        self.rotary_ndims = int(self.head_size * config.rotary_pct)
        # ??? TODO
        # self.register_buffer(
        #     "bias",
        #     torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
        #         1, 1, max_positions, max_positions
        #     ),
        # )
        # self.register_buffer("masked_bias", torch.tensor(-1e9))
        self.rotary_emb = RotaryEmbedding(
            self.rotary_ndims,
            config.max_position_embeddings,
            base=config.rotary_emb_base,
        )
        self.rotary_emb.inv_freq = nn.Parameter(
            weights.get_tensor(f"{prefix}.rotary_emb.inv_freq")
        )
        self.inv_norm_factor = 1.0 / torch.sqrt(
            torch.tensor(self.head_size, dtype=torch.float32)
        ).to(torch.get_default_dtype())

        if self.num_attention_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_attention_heads` must be divisible by `num_shards` "
                f"(got `num_attention_heads`: {self.num_attention_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_attention_heads = (
            self.num_attention_heads // weights.process_group.size()
        )
        self.query_key_value = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.query_key_value", weights=weights, bias=True
        )
        self.dense = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.dense", weights=weights, bias=True
        )

    def forward(
        self,
        hidden_states,
        position_ids,
        attention_mask,
        head_mask=None,
        layer_past=None,
        use_cache=False,
        output_attentions=False,
    ):
        has_layer_past = layer_past is not None

        # Compute QKV
        # Attention heads [batch, seq_len, hidden_size]
        #   --> [batch, seq_len, (np * 3 * head_size)]
        qkv = self.query_key_value(hidden_states)

        # [batch, seq_len, (num_heads * 3 * head_size)]
        #   --> [batch, seq_len, num_heads, 3 * head_size]
        new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
        qkv = qkv.view(*new_qkv_shape).permute(0, 2, 1, 3)
        # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size]
        query, key, value = qkv.split(self.head_size, -1)

        # Compute token offset for rotary embeddings (when decoding)
        seq_len = key.shape[-2]
        if has_layer_past:
            seq_len += layer_past[0].shape[-2]

        # Compute rotary embeddings on rotary_ndims
        query_rot = query[..., : self.rotary_ndims]
        key_rot = key[..., : self.rotary_ndims]

        query_rot, key_rot = self.rotary_emb(query_rot, key_rot, position_ids, seq_len)

        query[..., : self.rotary_ndims] = query_rot
        key[..., : self.rotary_ndims] = key_rot

        if CUSTOM_KERNELS_ENABLED:
            attn_output, present, attn_weights = fused_attention_cuda.forward(
                query,
                key,
                value,
                layer_past,
                attention_mask,
                head_mask,
                self.inv_norm_factor,
                self.num_attention_heads,
                use_cache,
            )
        else:
            # Cache QKV values
            if has_layer_past:
                past_key = layer_past[0]
                past_value = layer_past[1]
                key = torch.cat((past_key, key), dim=-2)
                value = torch.cat((past_value, value), dim=-2)
            present = (key, value) if use_cache else None

            # Compute attention
            attn_output, attn_weights = self._attn(
                query, key, value, attention_mask, head_mask
            )

            # Reshape outputs
            attn_output = self._merge_heads(
                attn_output, self.num_attention_heads, self.head_size
            )

        attn_output = self.dense(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs

    @classmethod
    def _split_heads(cls, tensor, num_attention_heads, attn_head_size):
        """
        Splits hidden dim into attn_head_size and num_attention_heads
        """
        # tensor: [bs, seq_len, hidden_size]
        new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
        # -> [bs, seq_len, num_attention_heads, attn_head_size]
        tensor = tensor.view(new_shape)
        # -> [bs, num_attention_heads, seq_len, attn_head_size]
        tensor = tensor.permute(0, 2, 1, 3)
        return tensor

    @classmethod
    def _merge_heads(cls, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden dim
        """
        # tensor [bs, num_attention_heads, seq_len, attn_head_size]
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        # -> [bs, seq_len, num_attention_heads, attn_head_size]
        tensor = tensor.view(
            tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size
        )
        # -> [bs, seq_len, hidden_size]
        return tensor

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size]
        # compute causal mask from causal mask buffer
        batch_size, num_attention_heads, query_length, attn_head_size = query.size()
        key_length = key.size(-2)

        query = query.reshape(
            batch_size * num_attention_heads, query_length, attn_head_size
        )
        key = key.reshape(batch_size * num_attention_heads, key_length, attn_head_size)
        attn_scores = torch.zeros(
            1,
            dtype=query.dtype,
            device=key.device,
        ).expand(batch_size * num_attention_heads, query_length, key_length)
        attn_scores = torch.baddbmm(
            attn_scores,
            query,
            key.transpose(1, 2),
            beta=1.0,
            alpha=self.inv_norm_factor,
        )

        # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
        input_dtype = attn_scores.dtype
        if input_dtype in [torch.float16, torch.bfloat16]:
            attn_scores = attn_scores.to(torch.float)
        attn_scores = torch.where(
            attention_mask, torch.finfo(attn_scores.dtype).min, attn_scores
        )
        attn_scores = attn_scores.view(
            batch_size, num_attention_heads, query_length, key_length
        )

        attn_weights = nn.functional.softmax(attn_scores, dim=-1)
        attn_weights = attn_weights.to(value.dtype)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)
        return attn_output, attn_weights


class RotaryEmbedding(torch.nn.Module):
    def __init__(self, dim, max_position_embeddings, base=10000, device=None):
        super().__init__()
        self.true_inv_freq = 1.0 / (
            base ** (torch.arange(0, dim, 2).float().to(device) / dim)
        )
        self.register_buffer("inv_freq", self.true_inv_freq)

        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings
        self.cos_cached = None
        self.sin_cached = None

    @staticmethod
    def rotate_half(x):
        """Rotates half the hidden dims of the input."""
        x1 = x[..., : x.shape[-1] // 2]
        x2 = x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    @staticmethod
    def _create_cos_sin(inv_freq, max_position_embeddings, dtype, device):
        t = torch.arange(
            max_position_embeddings, device=inv_freq.device, dtype=inv_freq.dtype
        )
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        return emb.cos().to(device).to(dtype), emb.sin().to(device).to(dtype)

    def forward(self, q, k, position_ids, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if (
            seq_len > self.max_seq_len_cached
            or self.cos_cached is None
            or self.sin_cached is None
        ):
            if seq_len > self.max_seq_len_cached:
                self.max_seq_len_cached = seq_len
            self.cos_cached, self.sin_cached = self._create_cos_sin(
                self.true_inv_freq, self.max_seq_len_cached, q.dtype, q.device
            )
        return rotary_forward(q, k, self.cos_cached, self.sin_cached, position_ids)


@torch.jit.script
def rotary_forward(q, k, cos, sin, position_ids):
    cos = cos[position_ids].unsqueeze(1)
    sin = sin[position_ids].unsqueeze(1)

    chunk_size = q.shape[-1] // 2
    q1, q2 = q.split(chunk_size, -1)
    q_rotated = torch.cat((-q2, q1), dim=-1)
    k1, k2 = k.split(chunk_size, -1)
    k_rotated = torch.cat((-k2, k1), dim=-1)

    q_embed = (q * cos) + (q_rotated * sin)
    k_embed = (k * cos) + (k_rotated * sin)
    return q_embed, k_embed


class GPTNeoXMLP(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.act = (
            ACT2FN[config.hidden_act]
            if "gelu_fast" not in config.hidden_act
            else lambda x: torch.nn.functional.gelu(x, approximate="tanh")
        )

        self.dense_h_to_4h = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.dense_h_to_4h", weights=weights, bias=True
        )
        self.dense_4h_to_h = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.dense_4h_to_h", weights=weights, bias=True
        )

    def forward(self, hidden_states):
        hidden_states = self.dense_h_to_4h(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dense_4h_to_h(hidden_states)
        return hidden_states


class GPTNeoXLayer(nn.Module):
    def __init__(self, layer_id, prefix: str, config, weights):
        super().__init__()
        self.use_parallel_residual = config.use_parallel_residual
        self.input_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.layers.{layer_id}.input_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )
        self.post_attention_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.layers.{layer_id}.post_attention_layernorm",
            weights=weights,
            eps=config.layer_norm_eps,
        )
        self.attention = GPTNeoXAttention(
            config, prefix=f"{prefix}.layers.{layer_id}.attention", weights=weights
        )
        self.mlp = GPTNeoXMLP(
            config, prefix=f"{prefix}.layers.{layer_id}.mlp", weights=weights
        )

    def forward(
        self,
        hidden_states,
        position_ids,
        attention_mask=None,
        head_mask=None,
        use_cache=False,
        layer_past=None,
        output_attentions=False,
    ):
        attention_layer_outputs = self.attention(
            self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            layer_past=layer_past,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attention_layer_outputs[
            0
        ]  # output_attn: attn_output, present, (attn_weights)
        outputs = attention_layer_outputs[1:]

        if self.use_parallel_residual:
            # pseudocode:
            # x = x + attn(ln1(x)) + mlp(ln2(x))
            mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
            hidden_states = mlp_output + attn_output + hidden_states
        else:
            # pseudocode:
            # x = x + attn(ln1(x))
            # x = x + mlp(ln2(x))
            attn_output = attn_output + hidden_states
            mlp_output = self.mlp(self.post_attention_layernorm(attn_output))
            hidden_states = mlp_output + attn_output

        if use_cache:
            outputs = (
                hidden_states,
            ) + outputs  # hidden_states, present, (attn_weights)
        else:
            outputs = (hidden_states,) + outputs[1:]  # hidden_states, (attn_weights)

        return outputs


class GPTNeoXModel(GPTNeoXPreTrainedModel):
    def __init__(self, prefix: str, config, weights):
        super().__init__(config)
        self.config = config

        self.num_attention_heads = config.num_attention_heads

        self.embed_in = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_in", weights=weights
        )
        self.layers = nn.ModuleList(
            [
                GPTNeoXLayer(layer_id, prefix, config, weights)
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.final_layer_norm = nn.LayerNorm.load(
            prefix=f"{prefix}.final_layer_norm",
            weights=weights,
            eps=config.layer_norm_eps,
        )
        self.tp_world_size = weights.process_group.size()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids=None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        r"""
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * self.config.num_hidden_layers)
        else:
            past_length = past_key_values[0][0].size(-2)

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_length, seq_length + past_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        if inputs_embeds is None:
            inputs_embeds = self.embed_in(input_ids)

        hidden_states = inputs_embeds

        # Attention mask.
        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values[0] is not None:
            past_key_values_length = past_key_values[0][0].shape[-1]
            seq_length_with_past = seq_length_with_past + past_key_values_length
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past), device=hidden_states.device
            )
        else:
            attention_mask = attention_mask.to(hidden_states.device)

        causal_mask = prepare_attn_mask(
            attention_mask,
            input_shape=(batch_size, seq_length),
            past_key_values_length=past_key_values_length,
        )

        assert self.num_attention_heads % self.tp_world_size == 0
        block_size = self.num_attention_heads // self.tp_world_size
        causal_mask = torch.repeat_interleave(causal_mask, block_size, dim=0)

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        presents = () if use_cache else None
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (layer, layer_past) in enumerate(zip(self.layers, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = layer(
                hidden_states,
                position_ids=position_ids,
                attention_mask=causal_mask,
                head_mask=head_mask[i],
                layer_past=layer_past,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)
            if output_attentions:
                all_attentions = all_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.final_layer_norm(hidden_states)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, presents, all_hidden_states, all_attentions]
                if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )


class GPTNeoxForCausalLM(GPTNeoXPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

    def __init__(self, prefix: str, config, weights):
        super().__init__(config)

        if not prefix:
            prefix = "gpt_neox"
        else:
            prefix = f"{prefix}.gpt_neox"

        self.gpt_neox = GPTNeoXModel(prefix, config, weights)
        self.embed_out = SpeculativeHead.load(
            config, prefix="embed_out", weights=weights
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional tensors are
            only required when the model is used as a decoder in a Sequence to Sequence model.

            Contains pre-computed hidden-states (key and values in the self-attention blocks that can be used (see
            `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
        >>> config = GPTNeoXConfig.from_pretrained("EleutherAI/gpt-neox-20b")
        >>> config.is_decoder = True
        >>> model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```"""
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.gpt_neox(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        lm_logits, speculative_logits = self.embed_out(hidden_states)

        lm_loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shift_logits = lm_logits[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)
            )

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return (
            CausalLMOutputWithPast(
                loss=lm_loss,
                logits=lm_logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            ),
            speculative_logits,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        input_shape = input_ids.shape

        # cut decoder_input_ids if past is used
        if past_key_values and past_key_values[0] is not None:
            input_ids = input_ids[:, -1:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "attention_mask": attention_mask,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
        )

        return model_inputs

    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx)
                    for past_state in layer_past[:2]
                )
                + layer_past[2:],
            )
        return reordered_past


================================================
FILE: server/text_generation_server/models/custom_modeling/opt_modeling.py
================================================
# coding=utf-8
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch OPT model."""

import random
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
)
from transformers.modeling_utils import PreTrainedModel
from transformers import OPTConfig
from text_generation_server.layers import (
    FastLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    SpeculativeHead,
)

EPS = 1e-5


# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
    input_ids_shape: torch.Size,
    dtype: torch.dtype,
    device: torch.device,
    past_key_values_length: int = 0,
):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full(
        (tgt_len, tgt_len),
        torch.tensor(torch.finfo(dtype).min, device=device),
        device=device,
    )
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat(
            [
                torch.zeros(
                    tgt_len, past_key_values_length, dtype=dtype, device=device
                ),
                mask,
            ],
            dim=-1,
        )
    return mask[None, None, :, :].expand(
        bsz, 1, tgt_len, tgt_len + past_key_values_length
    )


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(
        inverted_mask.to(torch.bool), torch.finfo(dtype).min
    )


class OPTLearnedPositionalEmbedding(nn.Module):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, prefix: str, weights):
        super().__init__()
        self.offset = 2
        self.weight = nn.Parameter(
            weights.get_tensor(
                f"{prefix if prefix else ''}decoder.embed_positions.weight"
            )
        )

    def forward(
        self, attention_mask: torch.LongTensor, past_key_values_length: int = 0
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        attention_mask = attention_mask.long()

        # create positions depending on attention_mask
        positions = (
            torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
        ).long() - 1

        # cut positions if `past_key_values_length` is > 0
        positions = positions[:, past_key_values_length:]

        return torch.nn.functional.embedding(positions + self.offset, self.weight)


class OPTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config,
        prefix,
        weights,
        is_decoder: bool = False,
        bias: bool = True,
        process_group=None,
    ):
        super().__init__()
        hidden_size = config.hidden_size
        num_heads = config.num_attention_heads

        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.dropout = config.dropout
        self.head_dim = hidden_size // num_heads

        if (self.head_dim * num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        process_group = weights.process_group
        if self.num_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.num_heads = self.num_heads // process_group.size()
        self.hidden_size = self.hidden_size // process_group.size()

        self.q_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.q_proj", weights=weights, bias=bias
        )
        self.k_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.k_proj", weights=weights, bias=bias
        )
        self.v_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.v_proj", weights=weights, bias=bias
        )
        self.out_proj = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.out_proj", weights=weights, bias=bias
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = (
                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
                + attention_mask
            )
            attn_weights = torch.max(
                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
        if attn_weights.dtype == torch.float16:
            attn_weights = nn.functional.softmax(
                attn_weights, dim=-1, dtype=torch.float32
            ).to(torch.float16)
        else:
            attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
                bsz, self.num_heads, tgt_len, src_len
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(
                bsz, self.num_heads, tgt_len, src_len
            )
            attn_weights = attn_weights_reshaped.view(
                bsz * self.num_heads, tgt_len, src_len
            )
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)

        # Use the `hidden_size` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned aross GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, self.hidden_size)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class OPTDecoderLayer(nn.Module):
    def __init__(self, layer_id: int, prefix: str, config: OPTConfig, weights):
        super().__init__()
        self.process_group = weights.process_group
        self.hidden_size = config.hidden_size
        self.self_attn = OPTAttention(
            config,
            prefix=f"{prefix}.self_attn",
            weights=weights,
            is_decoder=True,
            bias=config.enable_bias,
        )
        self.do_layer_norm_before = config.do_layer_norm_before
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]

        self.self_attn_layer_norm = nn.LayerNorm.load(
            prefix=f"{prefix}.self_attn_layer_norm", weights=weights, eps=EPS
        )
        self.fc1 = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.fc1", weights=weights, bias=config.enable_bias
        )
        self.fc2 = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.fc2", weights=weights, bias=config.enable_bias
        )
        self.final_layer_norm = nn.LayerNorm.load(
            prefix=f"{prefix}.final_layer_norm", weights=weights, eps=EPS
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """

        residual = hidden_states

        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
        if self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )
        hidden_states = residual + hidden_states

        # 350m applies layer norm AFTER attention
        if not self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Fully Connected
        hidden_states_shape = hidden_states.shape
        hidden_states = hidden_states.reshape(-1, hidden_states.size(-1))
        residual = hidden_states

        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
        if self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)

        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)

        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(
            hidden_states, p=self.dropout, training=self.training
        )

        hidden_states = (residual + hidden_states).view(hidden_states_shape)

        # 350m applies layer norm AFTER attention
        if not self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class OPTPreTrainedModel(PreTrainedModel):
    config_class = OPTConfig


class OPTDecoder(OPTPreTrainedModel):
    def __init__(self, prefix: str, config: OPTConfig, weights):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.vocab_size = config.vocab_size

        prefix = prefix + "." if prefix else ""

        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}decoder.embed_tokens", weights=weights
        )
        self.embed_positions = OPTLearnedPositionalEmbedding(prefix, weights)

        if config.word_embed_proj_dim != config.hidden_size:
            self.project_out = FastLinear.load(
                config,
                prefix=f"{prefix}decoder.project_out",
                weights=weights,
                bias=False,
            )
        else:
            self.project_out = None

        if config.word_embed_proj_dim != config.hidden_size:
            self.project_in = FastLinear.load(
                config,
                prefix=f"{prefix}decoder.project_in",
                weights=weights,
                bias=False,
            )
        else:
            self.project_in = None

        # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
        # with checkpoints that have been fine-tuned before transformers v4.20.1
        # see https://github.com/facebookresearch/metaseq/pull/164
        if config.do_layer_norm_before and not config._remove_final_layer_norm:
            self.final_layer_norm = nn.LayerNorm.load(
                prefix=f"{prefix}decoder.final_layer_norm", weights=weights, eps=EPS
            )
        else:
            self.final_layer_norm = None

        self.layers = nn.ModuleList(
            [
                OPTDecoderLayer(
                    layer_id,
                    prefix=f"{prefix}decoder.layers.{layer_id}",
                    config=config,
                    weights=weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )

    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
    def _prepare_decoder_attention_mask(
        self, attention_mask, input_shape, inputs_embeds, past_key_values_length
    ):
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                device=inputs_embeds.device,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(
                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
            ).to(inputs_embeds.device)
            combined_attention_mask = (
                expanded_attn_mask
                if combined_attention_mask is None
                else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError(
                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape
        past_key_values_length = (
            past_key_values[0][0].shape[2] if past_key_values is not None else 0
        )
        # required mask seq length can be calculated via length of past
        mask_seq_length = past_key_values_length + seq_length

        # embed positions
        if attention_mask is None:
            attention_mask = torch.ones(
                batch_size, mask_seq_length, device=inputs_embeds.device
            )
        causal_attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, input_shape, inputs_embeds, past_key_values_length
        )
        pos_embeds = self.embed_positions(attention_mask, past_key_values_length)

        if self.project_in is not None:
            inputs_embeds = self.project_in(inputs_embeds)

        hidden_states = inputs_embeds + pos_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask], ["head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != (len(self.layers)):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {head_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):
                continue

            past_key_value = (
                past_key_values[idx] if past_key_values is not None else None
            )

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if self.final_layer_norm is not None:
            hidden_states = self.final_layer_norm(hidden_states)

        if self.project_out is not None:
            hidden_states = self.project_out(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class OPTModel(OPTPreTrainedModel):
    def __init__(self, prefix: str, config: OPTConfig, weights):
        super().__init__(config)
        self.decoder = OPTDecoder(prefix, config, weights)
        # Initialize weights and apply final processing

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs

        return BaseModelOutputWithPast(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
        )


class OPTForCausalLM(OPTPreTrainedModel):
    def __init__(self, prefix, config, weights):
        super().__init__(config)
        if not prefix and any(s.startswith("model") for s in weights.routing.keys()):
            prefix = "model"

        self.model = OPTModel(prefix, config, weights)

        self.lm_head = SpeculativeHead.load(
            config,
            prefix=f"{prefix + '.' if prefix else ''}decoder.embed_tokens",
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits, speculative_logits = self.lm_head(outputs.last_hidden_state)

        loss = None

        return (
            CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            ),
            speculative_logits,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if past_key_values:
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx) for past_state in layer_past
                ),
            )
        return reordered_past


================================================
FILE: server/text_generation_server/models/custom_modeling/phi_modeling.py
================================================
# imlementation of the PhiModel and PhiForCausalLM classes

import torch
import torch.distributed

import math
from torch import nn
from typing import Optional, List, Tuple
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast

from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    FastLinear,
)


# PhiConfig is the configuration class for the PhiModel.
class PhiConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=51200,
        n_positions=2048,
        n_embd=2560,
        n_layer=32,
        n_inner=None,
        n_head=32,
        rotary_dim=32,
        layer_norm_epsilon=1e-5,
        tie_word_embeddings=False,
        pad_vocab_size_multiple=64,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        no_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_inner = n_inner
        self.n_head = n_head
        self.rotary_dim = rotary_dim

        self.layer_norm_epsilon = layer_norm_epsilon
        self.tie_word_embeddings = tie_word_embeddings
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.no_bias = no_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


# RotaryEmbedding is a class that implements the rotary embedding.
class RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_seq_len):
        super().__init__()
        inv_freq = [1.0 / 10000.0 ** (i / dim) for i in range(0, dim, 2)]
        inv_freq_len = len(inv_freq)
        inv_freq = torch.tensor(inv_freq).view(1, inv_freq_len)
        t = torch.arange(0, max_seq_len, dtype=torch.float).view(max_seq_len, 1)
        freqs = t.matmul(inv_freq)
        self.sin = freqs.sin()
        self.cos = freqs.cos()

    def apply_rotary_emb_qkv(self, qkv, seqlen_offset):
        b_size, seqlen, three, _, _headdim = qkv.shape
        if three != 3:
            raise Exception("unexpected shape for qkv")
        _, rotary_dim = self.cos.shape
        rotary_dim = rotary_dim * 2
        q_rot = qkv[:, :, 0, :, :rotary_dim]
        q_pass = qkv[:, :, 0, :, rotary_dim:]
        k_rot = qkv[:, :, 1, :, :rotary_dim]
        k_pass = qkv[:, :, 1, :, rotary_dim:]
        q12 = torch.chunk(q_rot, 2, dim=-1)
        k12 = torch.chunk(k_rot, 2, dim=-1)
        q1, q2 = q12[0], q12[1]
        k1, k2 = k12[0], k12[1]
        c = self.cos.narrow(0, seqlen_offset, seqlen).unsqueeze(1)
        s = self.sin.narrow(0, seqlen_offset, seqlen).unsqueeze(1)
        q_rot = torch.cat(
            [
                q1 * c - q2 * s,
                q1 * s + q2 * c,
            ],
            dim=-1,
        )
        k_rot = torch.cat(
            [
                k1 * c - k2 * s,
                k1 * s + k2 * c,
            ],
            dim=-1,
        )
        q = torch.cat([q_rot, q_pass], dim=-1)
        k = torch.cat([k_rot, k_pass], dim=-1)
        v = qkv[:, :, 2]
        return q, k, v


# PhiCausalLMHead is the head of the PhiModel. It is a linear layer with a layer norm.
class PhiCausalLMHead(nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        self.ln = nn.LayerNorm.load(
            prefix="lm_head.ln",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.linear = SpeculativeHead.load(
            config=config, prefix="lm_head.linear", weights=weights
        )

    def forward(self, hidden_states):
        hidden_states = self.ln(hidden_states)
        hidden_states = self.linear(hidden_states)
        return hidden_states


# PhiMHA is a multi-head attention layer. This layer uses an attention mask to prevent tokens from attending to subsequent tokens.
class PhiMHA(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.Wqkv = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias
        )
        self.out_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.out_proj",
            weights=weights,
            bias=not config.no_bias,
        )
        self.op_size = config.n_embd
        self.head_dim = int(config.n_embd / config.n_head)
        self.num_heads = config.n_head
        self.rotary_emb = RotaryEmbedding(
            config.rotary_dim,
            config.n_positions,
        )
        self.softmax_scale = 1.0 / math.sqrt(self.head_dim)

    def forward(
        self,
        hidden_states,
        past_kv_cache,
        attention_mask=None,
    ):
        b_size, seq_len, _n_embd = hidden_states.shape
        qkv = self.Wqkv(hidden_states)
        qkv = qkv.view(b_size, seq_len, 3, self.num_heads, self.head_dim)
        seqlen_offset = 0 if past_kv_cache is None else past_kv_cache[0].shape[1]
        q, k, v = self.rotary_emb.apply_rotary_emb_qkv(qkv, seqlen_offset)

        # if there is a kv_cache, then we need to concatenate
        if past_kv_cache is not None:
            prev_k, prev_v = past_kv_cache
            k = torch.cat([prev_k, k], dim=1)
            v = torch.cat([prev_v, v], dim=1)

        past_kv_cache = [k, v]
        attn_weights = torch.einsum("bthd,bshd->bhts", q, k * self.softmax_scale)

        if attention_mask is not None:
            seqlen_k = k.shape[1]
            seqlen_q = q.shape[1]
            causal_mask = torch.triu(
                torch.full((seqlen_q, seqlen_k), -10000.0, device=attn_weights.device),
                1,
            )
            attn_weights = attn_weights + causal_mask.to(dtype=attn_weights.dtype)

        attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
        attn_output = attn_weights.matmul(v.transpose(1, 2)).squeeze(0)
        attn_output = (
            attn_output.view((b_size, self.num_heads, seq_len, self.head_dim))
            .transpose(1, 2)
            .flatten(-2)
        )
        return self.out_proj(attn_output), past_kv_cache


# PhiMLP is a multi-layer perceptron. It contains two linear layers with a gelu activation function.
class PhiMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()

        self.n_inner = config.n_inner
        self.fc1 = FastLinear.load(
            config=config,
            prefix=f"{prefix}.fc1",
            weights=weights,
            bias=False,
        )
        self.fc2 = FastLinear.load(
            config=config,
            prefix=f"{prefix}.fc2",
            weights=weights,
            bias=False,
        )
        self.activation = torch.nn.functional.gelu

    def forward(self, hidden_states):
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


# PhiBlock is a single transformer block. It contains a layer norm, a multi-head attention layer and an multi-layer perceptron.
class PhiBlock(nn.Module):
    def __init__(self, layer_id, config, weights):
        super().__init__()
        self.layer_id = layer_id
        self.layer_norm = nn.LayerNorm.load(
            prefix=f"{layer_id}.ln", weights=weights, eps=config.layer_norm_epsilon
        )
        self.mixer = PhiMHA(prefix=f"{layer_id}.mixer", config=config, weights=weights)
        self.mlp = PhiMLP(prefix=f"{layer_id}.mlp", config=config, weights=weights)

    def forward(
        self,
        hidden_states,
        kv_cache,
        attention_mask,
    ):
        residual = hidden_states
        hidden_states = self.layer_norm(hidden_states)
        attn_outputs, past_kv_cache = self.mixer(
            hidden_states, kv_cache, attention_mask
        )
        feed_forward_hidden_states = self.mlp(hidden_states)
        out = attn_outputs + feed_forward_hidden_states + residual
        return out, past_kv_cache


# PhiModel implements the embedding layer and the transformer blocks.
class PhiModel(nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
        self.tp_rank = weights.process_group.rank()
        self.tp_world_size = weights.process_group.size()
        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embd.wte", weights=weights
        )
        self.blocks = nn.ModuleList(
            [
                PhiBlock(f"{prefix}.h.{layer_id}", config, weights)
                for layer_id in range(config.n_layer)
            ]
        )

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.ByteTensor] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,
    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
        hidden_states = self.embed_tokens(input_ids)
        seq_len = hidden_states.shape[1]
        mask = None if seq_len <= 1 else attention_mask

        past_key_values = (
            [None] * len(self.blocks) if past_key_values is None else past_key_values
        )

        for index, block in enumerate(self.blocks):
            hidden_states, new_key_values = block(
                hidden_states, past_key_values[index], mask
            )
            past_key_values[index] = new_key_values

        return hidden_states, past_key_values


# PhiForCausalLM wraps the PhiModel and PhiCausalLMHead together and returns a CausalLMOutputWithPast object.
class PhiForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

        if not prefix:
            prefix = "transformer"
        else:
            prefix = f"{prefix}.transformer"

        self.model = PhiModel(prefix, config, weights)
        self.lm_head = PhiCausalLMHead(config, weights)

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
        attention_mask: Optional[torch.ByteTensor] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
        model_output = self.model(
            input_ids, past_key_values, attention_mask, return_dict, use_cache
        )
        logits = self.lm_head(model_output[0])

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(
                logits[:, :-1].view(-1, logits.size(-1)), labels[:, 1:].view(-1)
            )

        if not return_dict:
            return (
                ((loss,) + (logits,) + model_output[1:])
                if loss is not None
                else (logits,) + model_output[1:]
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=model_output[1],
            hidden_states=None,
            attentions=None,
        )


================================================
FILE: server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
================================================
# coding=utf-8
# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Qwen2.5 VL model."""

from typing import Optional, Tuple, List

import torch
import torch.utils.checkpoint
from torch import nn
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM == "ipex":
    import intel_extension_for_pytorch as ipex
else:
    import flash_attn_2_cuda

import numpy as np

from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig

import torch.nn.functional as F

from text_generation_server.layers.layernorm import FastRMSNorm
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
)
from text_generation_server.layers.attention import (
    Seqlen,
)
from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
    Qwen2Model,
)

# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
from typing import Union
from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput, VideoInput
from transformers.processing_utils import (
    ProcessingKwargs,
    ProcessorMixin,
    Unpack,
    VideosKwargs,
)
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput


class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
    fps: Union[List[float], float]


class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
    videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "videos_kwargs": {"fps": 2.0},
    }


class Qwen2_5_VLProcessor(ProcessorMixin):
    r"""
    Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
        self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
    ):
        self.image_token = (
            "<|image_pad|>"
            if not hasattr(tokenizer, "image_token")
            else tokenizer.image_token
        )
        self.video_token = (
            "<|video_pad|>"
            if not hasattr(tokenizer, "video_token")
            else tokenizer.video_token
        )
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[
            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
        ] = None,
        videos: VideoInput = None,
        **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Qwen2_5_VLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if images is not None:
            image_inputs = self.image_processor(
                images=images, videos=None, **output_kwargs["images_kwargs"]
            )
            image_grid_thw = image_inputs["image_grid_thw"]
        else:
            image_inputs = {}
            image_grid_thw = None

        if videos is not None:
            videos_inputs = self.image_processor(
                images=None, videos=videos, **output_kwargs["images_kwargs"]
            )
            video_grid_thw = videos_inputs["video_grid_thw"]

            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
            if isinstance(fps, (int, float)):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / fps
                ] * len(video_grid_thw)
            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / tmp for tmp in fps
                ]
            else:
                raise ValueError(
                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
                )
            videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})

        else:
            videos_inputs = {}
            video_grid_thw = None

        if not isinstance(text, list):
            text = [text]

        if image_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    text[i] = text[i].replace(
                        self.image_token,
                        "<|placeholder|>"
                        * (image_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        if video_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
                    text[i] = text[i].replace(
                        self.video_token,
                        "<|placeholder|>"
                        * (video_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.video_token)

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(self, generated_outputs):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.

        Returns:
            `List[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(
            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
        )
        return names_from_processor + ["second_per_grid_ts"]


# Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
class Qwen2_5_VLVisionConfig(PretrainedConfig):
    model_type = "qwen2_5_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        spatial_patch_size=14,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=[7, 15, 23, 31],
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_patch_size = spatial_patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.fullatt_block_indexes = fullatt_block_indexes
        self.out_hidden_size = out_hidden_size


class Qwen2_5_VLConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        if vision_config is not None:
            self.vision_config = Qwen2_5_VLVisionConfig(**vision_config)

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        # and change type from 'mrope' to 'default' because `mrope` does defeault RoPE calculations
        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
        # TODO: @raushan update config in the hub
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb_vision(
    tensor: torch.Tensor, freqs: torch.Tensor
) -> torch.Tensor:
    orig_dtype = tensor.dtype
    tensor = tensor.float()
    cos = freqs.cos()
    sin = freqs.sin()
    cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
    sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
    output = (tensor * cos) + (rotate_half(tensor) * sin)
    output = output.to(orig_dtype)
    return output


class Qwen2_5VLAttention(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.hidden_size // weights.process_group.size()
        self.head_dim = config.hidden_size // config.num_heads
        self.num_heads = config.num_heads // weights.process_group.size()

        self.qkv = TensorParallelColumnLinear.load_qkv(
            config,
            prefix=f"{prefix}.qkv",
            weights=weights,
            bias=False,
            num_heads=self.num_heads,
            num_key_value_heads=self.num_heads,
        )
        self.qkv.linear.bias = weights.get_sharded(f"{prefix}.qkv.bias", dim=0)

        self.proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.proj",
            weights=weights,
            bias=True,
        )
        self.softmax_scale = 1.0 / np.sqrt(self.embed_dim // self.num_heads)

    def forward(
        self,
        hidden_state: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: torch.Tensor,
        max_seqlen: int,
    ) -> torch.Tensor:
        # apply the qkv linear layer to the hidden state
        qkv = self.qkv(hidden_state)
        query, key, value = qkv.split(
            [self.embed_dim, self.embed_dim, self.embed_dim], dim=1
        )

        # reshape the query, key, and value tensors
        _shape = (
            hidden_state.shape[0],
            self.num_heads,
            self.embed_dim // self.num_heads,
        )
        query = query.view(*_shape)
        key = key.view(*_shape)
        value = value.view(*_shape)

        # apply rotary positional embeddings
        query = apply_rotary_pos_emb_vision(query.unsqueeze(0), rotary_pos_emb).squeeze(
            0
        )
        key = apply_rotary_pos_emb_vision(key.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # calc maximum sequence length for any batch
        query = query.contiguous()
        key = key.contiguous()
        value = value.contiguous()
        causal = False

        # execute flash attention
        if SYSTEM == "ipex":
            attn_output = torch.empty_like(query)
            if query.device.type == "xpu":
                ipex.llm.functional.varlen_attention(
                    query.contiguous(),
                    key.contiguous(),
                    value.contiguous(),
                    attn_output,
                    cu_seqlens,
                    cu_seqlens,
                    None,
                    max_seqlen,
                    max_seqlen,
                    0.0,
                    self.softmax_scale,
                    False,
                    causal,
                    False,
                    None,
                )
            else:
                ipex.llm.functional.varlen_attention(
                    query,
                    key,
                    value,
                    attn_output,
                    cu_seqlens,
                    cu_seqlens,
                    max_seqlen,
                    max_seqlen,
                    0.0,
                    self.softmax_scale,
                    False,
                    causal,
                    False,
                    None,
                )
        else:
            attn_output = flash_attn_2_cuda.varlen_fwd(
                query,
                key,
                value,
                None,  # tmp buffer (auto-allocated)
                cu_seqlens,  # cu_seqlens_q
                cu_seqlens,  # cu_seqlens_k
                None,  # max_seqlen_q (auto-computed)
                None,  # max_seqlen_k (auto-computed)
                None,  # block_tables
                None,  # broadcast_mask
                max_seqlen,  # max_seqlen
                max_seqlen,  # max_seqlen
                0.0,  # dropout_p
                self.softmax_scale,
                False,  # zero_tensors
                causal,  # causal attention within each sequence
                -1,  # window_size_left
                -1,  # window_size_right
                0.0,  # softmax_cap
                False,  # deterministic
                None,  # rng_state
            )[0]

        # reshape output to original dimensions
        attn_output = attn_output.reshape(hidden_state.shape[0], -1)
        attn_output = self.proj(attn_output)
        return attn_output


class Qwen2_5VLVisionMLP(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.activation_fn = ACT2FN[config.hidden_act]

        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )

        self.up = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.up_proj", weights=weights, config=config, bias=True
        )
        self.gate = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.gate_proj", weights=weights, config=config, bias=True
        )
        self.down = TensorParallelRowLinear.load(
            prefix=f"{prefix}.down_proj", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        gate_states = self.gate(hidden_states)
        up_states = self.up(hidden_states)
        activated_states = self.activation_fn(gate_states) * up_states
        down_states = self.down(activated_states)
        return down_states


class Qwen2_5VLVisionBlock(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.attn = Qwen2_5VLAttention(
            prefix=f"{prefix}.attn",
            config=config,
            weights=weights,
        )
        self.norm1 = FastRMSNorm.load(
            prefix=f"{prefix}.norm1",
            weights=weights,
            eps=1e-6,
        )
        self.norm2 = FastRMSNorm.load(
            prefix=f"{prefix}.norm2",
            weights=weights,
            eps=1e-6,
        )
        self.mlp = Qwen2_5VLVisionMLP(
            prefix=f"{prefix}.mlp",
            config=config,
            weights=weights,
        )

    def forward(
        self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen
    ) -> torch.Tensor:
        norm1_out, _ = self.norm1(hidden_states)
        attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen)
        hidden_states = hidden_states + attn_out
        norm2_out, _ = self.norm2(hidden_states)
        mlp_out = self.mlp(norm2_out)
        hidden_states = hidden_states + mlp_out
        return hidden_states


class Qwen2_5VLPatchMerger(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.hidden_size = config.hidden_size * (config.spatial_merge_size**2)
        self.patch_merger_ln_q = FastRMSNorm.load(
            prefix=f"{prefix}.ln_q",
            weights=weights,
            eps=1e-6,
        )
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.mlp.0", weights=weights, config=config, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.mlp.2", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states) -> torch.Tensor:
        hidden_states, _ = self.patch_merger_ln_q(hidden_states)
        hidden_states = hidden_states.view(-1, self.hidden_size)
        hidden_states = self.fc1(hidden_states)
        hidden_states = F.gelu(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Qwen2_5VisionModel(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()

        self.spatial_merge_size = config.spatial_merge_size
        kernel_size = [config.temporal_patch_size, config.patch_size, config.patch_size]
        self.patch_embedding = nn.Conv3d(
            in_channels=config.in_channels,
            out_channels=config.hidden_size,
            kernel_size=kernel_size,
            stride=kernel_size,
            bias=False,
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embed.proj.weight"), requires_grad=False
        )
        head_dim = config.hidden_size // config.num_heads

        theta = 10000.0
        dim = head_dim // 2
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        self.blocks = nn.ModuleList(
            [
                Qwen2_5VLVisionBlock(
                    prefix=f"{prefix}.blocks.{i}",
                    config=config,
                    weights=weights,
                )
                for i in range(config.depth)
            ]
        )
        self.merger = Qwen2_5VLPatchMerger(
            prefix=f"{prefix}.merger",
            config=config,
            weights=weights,
        )

        self.temporal_patch_size = config.temporal_patch_size
        self.spatial_patch_size = config.spatial_patch_size
        self.in_channels = config.in_channels
        self.embed_dim = config.hidden_size
        self.window_size = config.window_size
        self.patch_size = config.patch_size
        self.spatial_merge_unit = config.spatial_merge_size * config.spatial_merge_size
        self.fullatt_block_indexes = config.fullatt_block_indexes

    def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor:
        batch_size, _, hidden_size = hidden_state.shape
        class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size)
        hidden_state = torch.cat([class_embedding, hidden_state], dim=1)
        return hidden_state

    def get_window_index(self, grid_thw):
        window_index: list = []
        cu_window_seqlens: list = [0]
        window_index_id = 0
        vit_merger_window_size = (
            self.window_size // self.spatial_merge_size // self.patch_size
        )

        for grid_t, grid_h, grid_w in grid_thw:
            llm_grid_h, llm_grid_w = (
                grid_h // self.spatial_merge_size,
                grid_w // self.spatial_merge_size,
            )
            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
                grid_t, llm_grid_h, llm_grid_w
            )
            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
            index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
            index_padded = index_padded.reshape(
                grid_t,
                num_windows_h,
                vit_merger_window_size,
                num_windows_w,
                vit_merger_window_size,
            )
            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
                grid_t,
                num_windows_h * num_windows_w,
                vit_merger_window_size,
                vit_merger_window_size,
            )
            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
            index_padded = index_padded.reshape(-1)
            index_new = index_padded[index_padded != -100]
            window_index.append(index_new + window_index_id)
            cu_seqlens_tmp = (
                seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
            )
            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
        window_index = torch.cat(window_index, dim=0)

        return window_index, cu_window_seqlens

    def forward(
        self,
        pixel_values: torch.Tensor,
        grid_thw: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        # reshape the input tensor for processing
        shape = (
            -1,
            self.in_channels,
            self.temporal_patch_size,
            self.spatial_patch_size,
            self.spatial_patch_size,
        )
        pixel_values = pixel_values.view(shape).to(self.patch_embedding.weight.dtype)
        hidden_states = self.patch_embedding(pixel_values).view(-1, self.embed_dim)
        # TODO: revisit to see if we can avoid some of these reshapes

        # find the position ids for the input tensor based on the grid_thw
        pos_ids = []
        for t, h, w in grid_thw:
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
            hpos_ids = hpos_ids.flatten()

            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
            wpos_ids = wpos_ids.flatten()
            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))

        pos_ids = torch.cat(pos_ids, dim=0)

        max_grid_size = grid_thw[:, 1:].max()

        # apply the positional embeddings to the position ids
        seq = torch.arange(
            max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype
        )
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
        seq_len = hidden_states.shape[0]
        patch_shape = (seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
        og_shape = (seq_len, -1)

        hidden_states = hidden_states.view(patch_shape)[window_index, :, :].view(
            og_shape
        )
        rotary_pos_emb = rotary_pos_emb.view(patch_shape)[window_index, :, :].view(
            og_shape
        )

        rotary_pos_emb = rotary_pos_emb.to(device=hidden_states.device)

        cu_window_seqlens = torch.tensor(
            cu_window_seqlens,
            device=hidden_states.device,
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)

        # create a cu_seqlens tensor to be used in the attention mask
        cu_seqlens = torch.repeat_interleave(
            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
        ).cumsum(dim=0, dtype=torch.int32)
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
        max_seqlen = torch.max(cu_seqlens[1:] - cu_seqlens[:-1])

        # iterately apply the blocks to the hidden states
        for layer_num, block in enumerate(self.blocks):
            # NOTE: qwen2_5_vl.py has a concept of full attention blocks
            # that are applied at specific layers.
            if layer_num in self.fullatt_block_indexes:
                cu_seqlens_now = cu_seqlens
            else:
                cu_seqlens_now = cu_window_seqlens

            hidden_states = block(
                hidden_states, cu_seqlens_now, rotary_pos_emb, max_seqlen
            )

        # apply the final patch merger to the hidden states
        hidden_states = self.merger(hidden_states)
        reverse_indices = torch.argsort(window_index)
        hidden_states = hidden_states[reverse_indices, :]
        return hidden_states


class Qwen2_5VLForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly
        # returns rope_scaling.type == "default" for Qwen2_5-VL model at the moment
        if (
            hasattr(config, "rope_scaling")
            and config.rope_scaling is not None
            and config.rope_scaling.get("type", None) == "default"
        ):
            config.rope_scaling.update({"rope_type": "mrope"})
        self.hidden_size = config.hidden_size
        self.vision_start_token_id = config.vision_start_token_id
        self.vision_end_token_id = config.vision_end_token_id
        self.image_token_id = config.image_token_id
        self.video_token_id = config.video_token_id
        self.spatial_merge_size = config.vision_config.spatial_merge_size
        self.embed_tokens = TensorParallelEmbedding(
            prefix="model.embed_tokens", weights=weights
        )
        self.visual = Qwen2_5VisionModel(
            prefix="visual", config=config.vision_config, weights=weights
        )
        self.text_model = Qwen2Model(prefix=None, config=config, weights=weights)
        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        self.lm_head = SpeculativeHead.load(
            config,
            prefix=suffix if not prefix else f"{prefix}.{suffix}",
            weights=weights,
        )
        self.device = weights.device

    # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391
    # modified to first find segments then initialize position ids for each segment
    # Steps:
    #  locate all vision and text segments
    #  calculate `vision_segment_lengths` for each vision segment to be use as offset
    #  calculate `text_segment_lengths` for each text segment to be used as offset
    #  create position ids for each vision segment based on the image grid
    #  create position ids for each text segment
    #  combine all the position ids
    #  the final segment is the difference between the last vision segment and the end of the input
    #  combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3)
    def get_position_ids(
        self,
        input_ids: torch.Tensor,
        image_grid_thw: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if image_grid_thw is None:
            return (
                torch.arange(input_ids.shape[0], device=input_ids.device)
                .unsqueeze(1)
                .repeat(1, 3)
            )

        spatial_merge_size = self.spatial_merge_size
        vision_start_token_id = self.vision_start_token_id
        vision_end_token_id = self.vision_end_token_id
        device = input_ids.device
        dtype = input_ids.dtype
        input_ids_len = input_ids.shape[0]

        vision_starts = torch.where(input_ids == vision_start_token_id)[0]
        vision_ends = torch.where(input_ids == vision_end_token_id)[0]
        vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
        prev_vision_end = torch.cat(
            [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
        )
        text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
        vision_widths_max = torch.cat(
            [
                torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
                image_grid_thw[:-1, 2] // spatial_merge_size,
            ]
        )
        vision_segment_lengths = vision_widths_max + text_lengths_between_vision
        vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
        text_segment_lengths = vision_segment_lengths - text_lengths_between_vision

        # create position ids for each vision segment based on the image grid
        llm_pos_ids_list = []
        for i, _ in enumerate(vision_segments):
            t, h, w = (
                image_grid_thw[i][0],
                image_grid_thw[i][1] // spatial_merge_size,
                image_grid_thw[i][2] // spatial_merge_size,
            )
            t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
            h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
            w_indices = torch.arange(w, device=device).repeat(t * h)
            image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)

            # offset by the position of the last vision segment
            im = image_position_ids + vision_segment_lengths[i]
            llm_pos_ids_list.append(im)

        # create position ids for each text segment
        text_ranges = [
            torch.arange(seq_len, device=device).view(1, -1).expand(3, -1)
            + text_segment_lengths[i]
            for i, seq_len in enumerate(text_lengths_between_vision)
        ]

        full_llm_pos_ids_list = [
            item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
        ]
        # import ipdb

        # ipdb.set_trace()
        max_s = full_llm_pos_ids_list[-1].max() + 1
        final_text_len = input_ids_len - vision_ends[-1]
        if final_text_len > 0:
            m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
            full_llm_pos_ids_list.append(m + max_s)

        position_ids = (
            torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
        )
        return position_ids

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).squeeze(0)
        return image_embeds

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.embed_tokens(input_ids)

        # apply the visual model to the pixel values if they are provided
        if vision_embeds is not None:
            inputs_embeds[input_ids == self.image_token_id] = vision_embeds

        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor],
        # Unused in this model
        attention_mask: Optional[torch.Tensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
        image_indices=None,
    ):
        hidden_states = self.text_model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
            true_max_s=max_s,
            prefill_cache_indices=prefill_cache_indices,
            adapter_data=adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/qwen2_vl.py
================================================
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Qwen2 VL model."""

from typing import Optional, Tuple, List

import torch
import torch.utils.checkpoint
from torch import nn
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM == "ipex":
    import intel_extension_for_pytorch as ipex
else:
    import flash_attn_2_cuda

import numpy as np

from transformers.activations import ACT2FN
import torch.nn.functional as F

from text_generation_server.layers.layernorm import FastLayerNorm, FastRMSNorm
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
)
from text_generation_server.layers.attention import (
    Seqlen,
)
from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
    Qwen2Model,
)


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb_vision(
    tensor: torch.Tensor, freqs: torch.Tensor
) -> torch.Tensor:
    orig_dtype = tensor.dtype
    tensor = tensor.float()
    cos = freqs.cos()
    sin = freqs.sin()
    cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
    sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
    output = (tensor * cos) + (rotate_half(tensor) * sin)
    output = output.to(orig_dtype)
    return output


class Qwen2VLAttention(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.embed_dim = config.embed_dim // weights.process_group.size()
        self.head_dim = config.hidden_size // config.num_heads
        self.num_heads = config.num_heads // weights.process_group.size()

        self.qkv = TensorParallelColumnLinear.load_qkv(
            config,
            prefix=f"{prefix}.qkv",
            weights=weights,
            bias=False,
            num_heads=self.num_heads,
            num_key_value_heads=self.num_heads,
        )
        self.qkv.linear.bias = weights.get_sharded(f"{prefix}.qkv.bias", dim=0)
        self.proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.proj",
            weights=weights,
            bias=True,
        )
        self.softmax_scale = 1.0 / np.sqrt(self.embed_dim // self.num_heads)

    def forward(
        self,
        hidden_state: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: torch.Tensor,
        max_seqlen: int,
    ) -> torch.Tensor:
        # apply the qkv linear layer to the hidden state
        qkv = self.qkv(hidden_state)
        query, key, value = qkv.split(
            [self.embed_dim, self.embed_dim, self.embed_dim], dim=1
        )

        # reshape the query, key, and value tensors
        _shape = (
            hidden_state.shape[0],
            self.num_heads,
            self.embed_dim // self.num_heads,
        )
        query = query.view(*_shape)
        key = key.view(*_shape)
        value = value.view(*_shape)

        # apply rotary positional embeddings
        query = apply_rotary_pos_emb_vision(query.unsqueeze(0), rotary_pos_emb).squeeze(
            0
        )
        key = apply_rotary_pos_emb_vision(key.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # calc maximum sequence length for any batch
        query = query.contiguous()
        key = key.contiguous()
        value = value.contiguous()
        causal = False

        # execute flash attention
        if SYSTEM == "ipex":
            attn_output = torch.empty_like(query)
            if query.device.type == "xpu":
                ipex.llm.functional.varlen_attention(
                    query.contiguous(),
                    key.contiguous(),
                    value.contiguous(),
                    attn_output,
                    cu_seqlens,
                    cu_seqlens,
                    None,
                    max_seqlen,
                    max_seqlen,
                    0.0,
                    self.softmax_scale,
                    False,
                    causal,
                    False,
                    None,
                )
            else:
                ipex.llm.functional.varlen_attention(
                    query,
                    key,
                    value,
                    attn_output,
                    cu_seqlens,
                    cu_seqlens,
                    max_seqlen,
                    max_seqlen,
                    0.0,
                    self.softmax_scale,
                    False,
                    causal,
                    False,
                    None,
                )
        else:
            attn_output = flash_attn_2_cuda.varlen_fwd(
                query,
                key,
                value,
                None,  # tmp buffer (auto-allocated)
                cu_seqlens,  # cu_seqlens_q
                cu_seqlens,  # cu_seqlens_k
                None,  # max_seqlen_q (auto-computed)
                None,  # max_seqlen_k (auto-computed)
                None,  # block_tables
                None,  # broadcast_mask
                max_seqlen,  # max_seqlen
                max_seqlen,  # max_seqlen
                0.0,  # dropout_p
                self.softmax_scale,
                False,  # zero_tensors
                causal,  # causal attention within each sequence
                -1,  # window_size_left
                -1,  # window_size_right
                0.0,  # softmax_cap
                False,  # deterministic
                None,  # rng_state
            )[0]

        # reshape output to original dimensions
        attn_output = attn_output.reshape(hidden_state.shape[0], -1)
        attn_output = self.proj(attn_output)
        return attn_output


class Qwen2VLVisionMLP(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.fc1", weights=weights, config=config, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.fc2", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Qwen2VLVisionBlock(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.attn = Qwen2VLAttention(
            prefix=f"{prefix}.attn",
            config=config,
            weights=weights,
        )
        self.norm1 = FastLayerNorm.load(
            prefix=f"{prefix}.norm1",
            weights=weights,
            eps=1e-6,
        )
        self.norm2 = FastLayerNorm.load(
            prefix=f"{prefix}.norm2",
            weights=weights,
            eps=1e-6,
        )
        self.mlp = Qwen2VLVisionMLP(
            prefix=f"{prefix}.mlp",
            config=config,
            weights=weights,
        )

    def forward(
        self, hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen
    ) -> torch.Tensor:
        norm1_out, residual = self.norm1(hidden_states)
        attn_out = self.attn(norm1_out, cu_seqlens, rotary_pos_emb, max_seqlen)
        hidden_states = attn_out + residual
        norm2_out, residual = self.norm2(hidden_states)
        hidden_states = hidden_states + self.mlp(norm2_out)
        return hidden_states


class Qwen2VLPatchMerger(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.hidden_size = config.embed_dim * (config.spatial_merge_size**2)
        self.patch_merger_ln_q = FastLayerNorm.load(
            prefix=f"{prefix}.ln_q",
            weights=weights,
            eps=1e-6,
        )
        self.fc1 = TensorParallelColumnLinear.load(
            prefix=f"{prefix}.mlp.0", weights=weights, config=config, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(
            prefix=f"{prefix}.mlp.2", weights=weights, config=config, bias=True
        )

    def forward(self, hidden_states) -> torch.Tensor:
        hidden_states, _ = self.patch_merger_ln_q(hidden_states)
        hidden_states = hidden_states.view(-1, self.hidden_size)
        hidden_states = self.fc1(hidden_states)
        hidden_states = F.gelu(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Qwen2VisionModel(nn.Module):
    def __init__(self, *, prefix, config, weights):
        super().__init__()
        self.spatial_merge_size = config.spatial_merge_size
        kernel_size = [config.temporal_patch_size, config.patch_size, config.patch_size]
        self.patch_embedding = nn.Conv3d(
            in_channels=config.in_chans,
            out_channels=config.embed_dim,
            kernel_size=kernel_size,
            stride=kernel_size,
            bias=False,
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embed.proj.weight"), requires_grad=False
        )
        head_dim = config.embed_dim // config.num_heads
        # TODO: replace with static positional embeddings once implemented
        theta = 10000.0
        dim = head_dim // 2
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        self.blocks = nn.ModuleList(
            [
                Qwen2VLVisionBlock(
                    prefix=f"{prefix}.blocks.{i}",
                    config=config,
                    weights=weights,
                )
                for i in range(config.depth)
            ]
        )
        self.merger = Qwen2VLPatchMerger(
            prefix=f"{prefix}.merger",
            config=config,
            weights=weights,
        )

        self.temporal_patch_size = config.temporal_patch_size
        self.spatial_patch_size = config.spatial_patch_size
        self.in_channels = config.in_channels
        self.embed_dim = config.embed_dim

    def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor:
        batch_size, _, hidden_size = hidden_state.shape
        class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size)
        hidden_state = torch.cat([class_embedding, hidden_state], dim=1)
        return hidden_state

    def forward(
        self,
        pixel_values: torch.Tensor,
        grid_thw: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        # reshape the input tensor for processing
        shape = (
            -1,
            self.in_channels,
            self.temporal_patch_size,
            self.spatial_patch_size,
            self.spatial_patch_size,
        )
        pixel_values = pixel_values.view(shape).to(self.patch_embedding.weight.dtype)
        hidden_states = self.patch_embedding(pixel_values).view(-1, self.embed_dim)
        # TODO: revisit to see if we can avoid some of these reshapes

        # find the position ids for the input tensor based on the grid_thw
        pos_ids = []
        for t, h, w in grid_thw:
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
            hpos_ids = hpos_ids.flatten()

            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
            wpos_ids = wpos_ids.flatten()
            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))

        pos_ids = torch.cat(pos_ids, dim=0)
        max_grid_size = grid_thw[:, 1:].max()

        # apply the positional embeddings to the position ids
        seq = torch.arange(
            max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype
        )
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, hidden_states.dtype)

        # create a cu_seqlens tensor to be used in the attention mask
        cu_seqlens = torch.repeat_interleave(
            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
        ).cumsum(dim=0, dtype=torch.int32)
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
        max_seqlen = torch.max(cu_seqlens[1:] - cu_seqlens[:-1])
        # iterately apply the blocks to the hidden states
        for block in self.blocks:
            hidden_states = block(hidden_states, cu_seqlens, rotary_pos_emb, max_seqlen)

        # apply the final patch merger to the hidden states
        hidden_states = self.merger(hidden_states)
        return hidden_states


class Qwen2VLForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        config.vision_config.quantize = None
        config.vision_config.speculator = config.speculator
        # set rope_scaling.type == "mrope" since AutoConfig.from_pretrained incorrectly
        # returns rope_scaling.type == "default" for Qwen2-VL model at the moment
        if (
            hasattr(config, "rope_scaling")
            and config.rope_scaling is not None
            and config.rope_scaling.get("type", None) == "default"
        ):
            config.rope_scaling.update({"rope_type": "mrope"})
        self.hidden_size = config.hidden_size
        self.vision_start_token_id = config.vision_start_token_id
        self.vision_end_token_id = config.vision_end_token_id
        self.image_token_id = config.image_token_id
        self.video_token_id = config.video_token_id
        self.spatial_merge_size = config.vision_config.spatial_merge_size
        self.embed_tokens = TensorParallelEmbedding(
            prefix="model.embed_tokens", weights=weights
        )
        self.visual = Qwen2VisionModel(
            prefix="visual", config=config.vision_config, weights=weights
        )
        self.text_model = Qwen2Model(prefix=None, config=config, weights=weights)
        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"

        self.lm_head = SpeculativeHead.load(
            config,
            prefix=suffix if not prefix else f"{prefix}.{suffix}",
            weights=weights,
        )
        self.norm = FastRMSNorm.load(
            prefix="model.norm",
            weights=weights,
            eps=config.rms_norm_eps,
        )
        self.device = weights.device

    # based on https://github.com/huggingface/transformers/blob/e284c7e954abe12c34b50461c17f8115a0afe115/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1391
    # modified to first find segments then initialize position ids for each segment
    # Steps:
    #  locate all vision and text segments
    #  calculate `vision_segment_lengths` for each vision segment to be use as offset
    #  calculate `text_segment_lengths` for each text segment to be used as offset
    #  create position ids for each vision segment based on the image grid
    #  create position ids for each text segment
    #  combine all the position ids
    #  the final segment is the difference between the last vision segment and the end of the input
    #  combine all the position ids and reshape to (3, input_ids_len) then swap dimensions to (input_ids_len, 3)
    def get_position_ids(
        self,
        input_ids: torch.Tensor,
        image_grid_thw: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if image_grid_thw is None:
            return (
                torch.arange(input_ids.shape[0], device=input_ids.device)
                .unsqueeze(1)
                .repeat(1, 3)
            )

        spatial_merge_size = self.spatial_merge_size
        vision_start_token_id = self.vision_start_token_id
        vision_end_token_id = self.vision_end_token_id
        device = input_ids.device
        dtype = input_ids.dtype
        input_ids_len = input_ids.shape[0]

        vision_starts = torch.where(input_ids == vision_start_token_id)[0]
        vision_ends = torch.where(input_ids == vision_end_token_id)[0]
        vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
        prev_vision_end = torch.cat(
            [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
        )
        text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
        vision_widths_max = torch.cat(
            [
                torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
                image_grid_thw[:-1, 2] // spatial_merge_size,
            ]
        )
        vision_segment_lengths = vision_widths_max + text_lengths_between_vision
        vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
        text_segment_lengths = vision_segment_lengths - text_lengths_between_vision

        # create position ids for each vision segment based on the image grid
        llm_pos_ids_list = []
        for i, _ in enumerate(vision_segments):
            t, h, w = (
                image_grid_thw[i][0],
                image_grid_thw[i][1] // spatial_merge_size,
                image_grid_thw[i][2] // spatial_merge_size,
            )
            t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
            h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
            w_indices = torch.arange(w, device=device).repeat(t * h)
            image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)

            # offset by the position of the last vision segment
            im = image_position_ids + vision_segment_lengths[i]
            llm_pos_ids_list.append(im)

        # create position ids for each text segment
        text_ranges = [
            torch.arange(seq_len, device=device).view(1, -1).expand(3, -1)
            + text_segment_lengths[i]
            for i, seq_len in enumerate(text_lengths_between_vision)
        ]

        full_llm_pos_ids_list = [
            item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
        ]
        max_s = full_llm_pos_ids_list[-1].max() + 1
        final_text_len = input_ids_len - vision_ends[-1]
        if final_text_len > 0:
            m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
            full_llm_pos_ids_list.append(m + max_s)

        position_ids = (
            torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
        )
        return position_ids

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).squeeze(0)
        return image_embeds

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: torch.Tensor = None,
    ):
        inputs_embeds = self.embed_tokens(input_ids)

        # apply the visual model to the pixel values if they are provided
        if vision_embeds is not None:
            inputs_embeds[input_ids == self.image_token_id] = vision_embeds

        return inputs_embeds

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
        lm_head_indices: Optional[torch.Tensor],
        adapter_data: Optional[torch.Tensor] = None,
        image_indices=None,
        attention_mask=None,
    ):
        hidden_states = self.text_model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
            true_max_s=max_s,
            prefill_cache_indices=prefill_cache_indices,
            adapter_data=adapter_data,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/custom_modeling/siglip.py
================================================
from typing import Optional, Tuple
import warnings
import math
import torch
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutputWithPooling,
)
from transformers import SiglipConfig, SiglipVisionConfig
from torch.nn.init import _calculate_fan_in_and_fan_out

from text_generation_server.layers.tensor_parallel import (
    TensorParallelEmbedding,
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
)


class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, prefix, config: SiglipVisionConfig, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )
        self.patch_embedding.weight = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
        )
        self.patch_embedding.bias = nn.Parameter(
            weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False
        )
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches
        self.position_embedding = TensorParallelEmbedding(
            prefix=f"{prefix}.position_embedding", weights=weights
        )
        self.register_buffer(
            "position_ids",
            torch.arange(self.num_positions, device=weights.device).expand((1, -1)),
            persistent=False,
        )

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        patch_embeds = self.patch_embedding(
            pixel_values
        )  # shape = [*, width, grid, grid]
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.head_size = self.head_dim
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.num_heads = self.num_heads // weights.process_group.size()
        self.embed_dim = self.embed_dim // weights.process_group.size()
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.k_proj", weights=weights, bias=True
        )
        self.v_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.v_proj", weights=weights, bias=True
        )
        self.q_proj = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.q_proj", weights=weights, bias=True
        )
        self.out_proj = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return (
            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, _ = hidden_states.size()
        query_states = self.q_proj(hidden_states)
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        # scale post matmul
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) * self.scale

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = (
                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
                + attention_mask
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(attn_weights.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_size)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class SiglipMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = TensorParallelColumnLinear.load(  # config.hidden_size, config.intermediate_size
            prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True
        )
        self.fc2 = TensorParallelRowLinear.load(  # config.intermediate_size, config.hidden_size
            prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class SiglipEncoderLayer(nn.Module):
    def __init__(self, prefix, config: SiglipConfig, weights):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = SiglipAttention(
            prefix=f"{prefix}.self_attn", config=config, weights=weights
        )
        self.layer_norm1 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps
        )
        self.mlp = SiglipMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
        self.layer_norm2 = nn.LayerNorm.load(
            prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> Tuple[torch.FloatTensor]:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
        )
        hidden_states = residual + hidden_states
        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states, None


class SiglipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling."""

    def __init__(self, prefix, config: SiglipVisionConfig, weights):
        super().__init__()

        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(
            config.hidden_size, config.num_attention_heads, batch_first=True
        )
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(prefix, config, weights)

    def forward(self, hidden_state):
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


def _trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    lower = norm_cdf((a - mean) / std)
    upper = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    tensor.uniform_(2 * lower - 1, 2 * upper - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal
    tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    tensor.clamp_(min=a, max=b)


def trunc_normal_tf_(
    tensor: torch.Tensor,
    mean: float = 0.0,
    std: float = 1.0,
    a: float = -2.0,
    b: float = 2.0,
) -> torch.Tensor:
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \\leq \text{mean} \\leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsquently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")


class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    """

    def __init__(self, prefix, config: SiglipConfig, weights):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList(
            [
                SiglipEncoderLayer(
                    prefix=f"{prefix}.layers.{i}", config=config, weights=weights
                )
                for i in range(config.num_hidden_layers)
            ]
        )

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            hidden_states, _ = encoder_layer(
                hidden_states,
                attention_mask,
            )

        return hidden_states


class SiglipVisionTransformer(nn.Module):
    def __init__(self, prefix, config: SiglipVisionConfig, weights):
        super().__init__()
        self.config = config

        self.embeddings = SiglipVisionEmbeddings(
            prefix=f"{prefix}.embeddings", config=config, weights=weights
        )
        self.encoder = SiglipEncoder(
            prefix=f"{prefix}.encoder", config=config, weights=weights
        )

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
    ):
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)

        # NOTE: up until this point, the code logits are exactly
        # the same as the transformers code. The values evaulate
        # slightly differently in our encoder layer.
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
        )
        last_hidden_state = encoder_outputs

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            # pooler_output=pooled_output,
            # hidden_states=encoder_outputs,
        )


================================================
FILE: server/text_generation_server/models/custom_modeling/t5_modeling.py
================================================
# coding=utf-8
# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch T5 model."""

import copy
import math
import warnings
from typing import Optional, Tuple, Union

from loguru import logger

import torch
import torch.distributed
from torch import nn
from torch.nn import CrossEntropyLoss

from transformers.activations import ACT2FN
from transformers.modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.utils import (
    is_torch_fx_proxy,
)
from transformers import T5Config
from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
    SpeculativeHead,
)

# copied from https://github.com/huggingface/transformers/blob/cd4584e3c809bb9e1392ccd3fe38b40daba5519a/src/transformers/models/t5/modeling_t5.py#L1316
# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""


class PartialTPEmbedding(nn.Module):
    def __init__(self, prefix: str, weights):
        super().__init__()
        weight = weights.get_sharded(f"{prefix}.weight", dim=1)
        self.weight = nn.Parameter(weight)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.embedding(input, self.weight)


@torch.jit.script
def layer_norm(hidden_states, weight, epsilon):
    # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
    # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated
    # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
    # half-precision inputs is done in fp32

    variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states * torch.rsqrt(variance + epsilon)

    # convert into half-precision if necessary
    if weight.dtype in [torch.float16, torch.bfloat16]:
        hidden_states = hidden_states.to(weight.dtype)

    return weight * hidden_states


class T5LayerNorm(nn.Module):
    def __init__(self, prefix, weights, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        weight = weights.get_tensor(f"{prefix}.weight")
        self.weight = nn.Parameter(weight)
        self.variance_epsilon = torch.tensor(eps)

    def forward(self, hidden_states):
        return layer_norm(hidden_states, self.weight, self.variance_epsilon)


try:
    from apex.normalization import FusedRMSNorm

    T5LayerNorm = FusedRMSNorm  # noqa

    logger.info(
        "Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm"
    )
except ImportError:
    # using the normal T5LayerNorm
    pass
except Exception:
    logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm")
    pass

ALL_LAYERNORM_LAYERS.append(T5LayerNorm)


class T5DenseActDense(nn.Module):
    def __init__(self, config: T5Config, prefix, weights):
        super().__init__()
        self.wi = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.wi", weights=weights, bias=False
        )

        ### XXX: T5 models do not handle well both f16 and quantization.
        ### Overidding specifically this layer for that reason.
        ### https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L316
        ### https://github.com/huggingface/transformers/issues/20287
        _q = config.quantize
        _dtype = weights.dtype
        weights.dtype = torch.float32
        config.quantize = None
        self.wo_cast = (torch.float32, _dtype)
        self.wo = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.wo", weights=weights, bias=False
        )
        weights.dtype = _dtype
        config.quantize = _q

        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = (
            ACT2FN[config.dense_act_fn]
            if "gelu" not in config.dense_act_fn
            else lambda x: torch.nn.functional.gelu(x, approximate="tanh")
        )

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states.to(dtype=self.wo_cast[0])
        hidden_states = self.wo(hidden_states)
        # XXX: Recasting is already done within the layer norm.
        # Casting back to float16 here modifies results
        # hidden_states = hidden_states.to(dtype=self.wo_cast[1])
        return hidden_states


class T5DenseGatedActDense(nn.Module):
    def __init__(self, config: T5Config, prefix, weights):
        super().__init__()
        self.wi_0 = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.wi_0", weights=weights, bias=False
        )
        self.wi_1 = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.wi_1", weights=weights, bias=False
        )
        ### XXX: T5 models do not handle well both f16 and quantization.
        ### Overidding specifically this layer for that reason.
        ### https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py#L316
        ### https://github.com/huggingface/transformers/issues/20287
        _q = config.quantize
        _dtype = weights.dtype
        weights.dtype = torch.float32
        config.quantize = None
        self.wo_cast = (torch.float32, _dtype)
        self.wo = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.wo", weights=weights, bias=False
        )
        weights.dtype = _dtype
        config.quantize = _q

        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = (
            ACT2FN[config.dense_act_fn]
            if "gelu" not in config.dense_act_fn
            else lambda x: torch.nn.functional.gelu(x, approximate="tanh")
        )

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states.to(dtype=self.wo_cast[0])
        hidden_states = self.wo(hidden_states)
        # XXX: Recasting is already done within the layer norm.
        # Casting back to float16 here modifies results
        # hidden_states = hidden_states.to(dtype=self.wo_cast[1])
        return hidden_states


class T5LayerFF(nn.Module):
    def __init__(self, config: T5Config, prefix, weights):
        super().__init__()
        if config.is_gated_act:
            self.DenseReluDense = T5DenseGatedActDense(
                config, prefix=f"{prefix}.DenseReluDense", weights=weights
            )
        else:
            self.DenseReluDense = T5DenseActDense(
                config, prefix=f"{prefix}.DenseReluDense", weights=weights
            )

        self.layer_norm = T5LayerNorm(
            prefix=f"{prefix}.layer_norm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class T5Attention(nn.Module):
    def __init__(
        self, config: T5Config, prefix, weights, has_relative_attention_bias=False
    ):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        process_group = weights.process_group
        # Mesh TensorFlow initialization to avoid scaling before softmax
        assert self.n_heads % process_group.size() == 0
        self.q = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.q", weights=weights, bias=False
        )
        self.k = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.k", weights=weights, bias=False
        )
        self.v = TensorParallelColumnLinear.load(
            config, prefix=f"{prefix}.v", weights=weights, bias=False
        )
        self.o = TensorParallelRowLinear.load(
            config, prefix=f"{prefix}.o", weights=weights, bias=False
        )
        if self.n_heads % weights.process_group.size() != 0:
            raise ValueError(
                f"`n_heads` must be divisible by `num_shards` (got `n_heads`: {self.n_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        self.n_heads = self.n_heads // process_group.size()
        self.inner_dim = self.inner_dim // process_group.size()

        if self.has_relative_attention_bias:
            self.relative_attention_bias = PartialTPEmbedding(
                prefix=f"{prefix}.relative_attention_bias", weights=weights
            )

    @staticmethod
    def _relative_position_bucket(
        relative_position, bidirectional=True, num_buckets=32, max_distance=128
    ):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(
                relative_position, torch.zeros_like(relative_position)
            )
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large,
            torch.full_like(relative_position_if_large, num_buckets - 1),
        )

        relative_buckets += torch.where(
            is_small, relative_position, relative_position_if_large
        )
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[
            :, None
        ]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[
            None, :
        ]
        relative_position = (
            memory_position - context_position
        )  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(
            relative_position_bucket
        )  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(
            0
        )  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
        # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)

        batch_size, seq_length = hidden_states.shape[:2]

        real_seq_length = seq_length

        if past_key_value is not None:
            assert (
                len(past_key_value) == 2
            ), f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states"
            real_seq_length += (
                past_key_value[0].shape[2] if query_length is None else query_length
            )

        key_length = (
            real_seq_length if key_value_states is None else key_value_states.shape[1]
        )

        def shape(states):
            """projection"""
            return states.view(
                batch_size, -1, self.n_heads, self.key_value_proj_dim
            ).transpose(1, 2)

        def unshape(states):
            """reshape"""
            return (
                states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
            )

        def project(hidden_states, proj_layer, key_value_states, past_key_value):
            """projects hidden states correctly to key/query states"""
            if key_value_states is None:
                # self-attn
                # (batch_size, n_heads, seq_length, dim_per_head)
                hidden_states = shape(proj_layer(hidden_states))
            elif past_key_value is None:
                # cross-attn
                # (batch_size, n_heads, seq_length, dim_per_head)
                hidden_states = shape(proj_layer(key_value_states))

            if past_key_value is not None:
                if key_value_states is None:
                    # self-attn
                    # (batch_size, n_heads, key_length, dim_per_head)
                    hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
                elif past_key_value.shape[2] != key_value_states.shape[1]:
                    # checking that the `sequence_length` of the `past_key_value` is the same as
                    # the provided `key_value_states` to support prefix tuning
                    # cross-attn
                    # (batch_size, n_heads, seq_length, dim_per_head)
                    hidden_states = shape(proj_layer(key_value_states))
                else:
                    # cross-attn
                    hidden_states = past_key_value
            return hidden_states

        # get query states
        query_states = shape(
            self.q(hidden_states)
        )  # (batch_size, n_heads, seq_length, dim_per_head)

        # get key/value states
        key_states = project(
            hidden_states,
            self.k,
            key_value_states,
            past_key_value[0] if past_key_value is not None else None,
        )
        value_states = project(
            hidden_states,
            self.v,
            key_value_states,
            past_key_value[1] if past_key_value is not None else None,
        )

        # compute scores
        scores = torch.matmul(
            query_states, key_states.transpose(3, 2)
        )  # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9

        if position_bias is None:
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, real_seq_length, key_length),
                    device=scores.device,
                    dtype=scores.dtype,
                )
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, device=scores.device
                )

            # if key and values are already calculated
            # we want only the last query position bias
            if past_key_value is not None:
                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]

            if mask is not None:
                position_bias = (
                    position_bias + mask
                )  # (batch_size, n_heads, seq_length, key_length)

        position_bias_masked = position_bias

        scores += position_bias_masked
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
            scores
        )  # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training
        )  # (batch_size, n_heads, seq_length, key_length)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = unshape(
            torch.matmul(attn_weights, value_states)
        )  # (batch_size, seq_length, dim)
        attn_output = self.o(attn_output)

        present_key_value_state = (
            (key_states, value_states) if (self.is_decoder and use_cache) else None
        )
        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class T5LayerSelfAttention(nn.Module):
    def __init__(self, config, prefix, weights, has_relative_attention_bias=False):
        super().__init__()
        self.SelfAttention = T5Attention(
            config,
            prefix=f"{prefix}.SelfAttention",
            weights=weights,
            has_relative_attention_bias=has_relative_attention_bias,
        )
        self.layer_norm = T5LayerNorm(
            prefix=f"{prefix}.layer_norm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[
            1:
        ]  # add attentions if we output them
        return outputs


class T5LayerCrossAttention(nn.Module):
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.EncDecAttention = T5Attention(
            config,
            prefix=f"{prefix}.EncDecAttention",
            weights=weights,
            has_relative_attention_bias=False,
        )
        self.layer_norm = T5LayerNorm(
            prefix=f"{prefix}.layer_norm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[
            1:
        ]  # add attentions if we output them
        return outputs


class T5Block(nn.Module):
    def __init__(self, config, prefix, weights, has_relative_attention_bias: bool):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(
            T5LayerSelfAttention(
                config,
                prefix=f"{prefix}.layer.0",
                weights=weights,
                has_relative_attention_bias=has_relative_attention_bias,
            )
        )
        if self.is_decoder:
            i = 2
            self.layer.append(
                T5LayerCrossAttention(
                    config, prefix=f"{prefix}.layer.1", weights=weights
                )
            )
        else:
            i = 1

        self.layer.append(
            T5LayerFF(config, prefix=f"{prefix}.layer.{i}", weights=weights)
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
    ):
        if past_key_value is not None:
            if not self.is_decoder:
                logger.warning(
                    "`past_key_values` is passed to the encoder. Please make sure this is intended."
                )
            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4

            if len(past_key_value) != expected_num_past_key_values:
                raise ValueError(
                    f"There should be {expected_num_past_key_values} past states. "
                    f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
                    f"Got {len(past_key_value)} past key / value states"
                )

            self_attn_past_key_value = past_key_value[:2]
            cross_attn_past_key_value = past_key_value[2:]
        else:
            self_attn_past_key_value, cross_attn_past_key_value = None, None

        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=self_attn_past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        hidden_states, present_key_value_state = self_attention_outputs[:2]
        attention_outputs = self_attention_outputs[
            2:
        ]  # Keep self-attention outputs and relative position weights

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.where(
                torch.isinf(hidden_states).any(),
                torch.finfo(hidden_states.dtype).max - 1000,
                torch.finfo(hidden_states.dtype).max,
            )
            hidden_states = torch.clamp(
                hidden_states, min=-clamp_value, max=clamp_value
            )

        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            # the actual query length is unknown for cross attention
            # if using past key value states. Need to inject it here
            if present_key_value_state is not None:
                query_length = present_key_value_state[0].shape[2]
            else:
                query_length = None

            cross_attention_outputs = self.layer[1](
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                query_length=query_length,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states = cross_attention_outputs[0]

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16:
                clamp_value = torch.where(
                    torch.isinf(hidden_states).any(),
                    torch.finfo(hidden_states.dtype).max - 1000,
                    torch.finfo(hidden_states.dtype).max,
                )
                hidden_states = torch.clamp(
                    hidden_states, min=-clamp_value, max=clamp_value
                )

            # Combine self attn and cross attn key value states
            if present_key_value_state is not None:
                present_key_value_state = (
                    present_key_value_state + cross_attention_outputs[1]
                )

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[2:]

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.where(
                torch.isinf(hidden_states).any(),
                torch.finfo(hidden_states.dtype).max - 1000,
                torch.finfo(hidden_states.dtype).max,
            )
            hidden_states = torch.clamp(
                hidden_states, min=-clamp_value, max=clamp_value
            )

        outputs = (hidden_states,)

        if use_cache:
            outputs = outputs + (present_key_value_state,) + attention_outputs
        else:
            outputs = outputs + attention_outputs

        return outputs  # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)


class T5PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = T5Config

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        assert decoder_start_token_id is not None, (
            "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id."
            " See T5 docs for more information"
        )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(
                input_ids.shape[:-1] + (1,), decoder_start_token_id
            )
            shifted_input_ids = torch.cat(
                [shifted_input_ids, input_ids[..., :-1]], dim=-1
            )
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        assert (
            pad_token_id is not None
        ), "self.model.config.pad_token_id has to be defined."
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids


class T5Stack(T5PreTrainedModel):
    def __init__(self, config, prefix, weights, embed_tokens):
        super().__init__(config)

        self.is_decoder = config.is_decoder

        self.embed_tokens = embed_tokens
        self.block = nn.ModuleList(
            [
                T5Block(
                    config,
                    prefix=f"{prefix}.block.{layer_id}",
                    weights=weights,
                    has_relative_attention_bias=(layer_id == 0),
                )
                for layer_id in range(config.num_layers)
            ]
        )
        self.final_layer_norm = T5LayerNorm(
            prefix=f"{prefix}.final_layer_norm",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        # Model parallel
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds"
            )

        if inputs_embeds is None:
            assert (
                self.embed_tokens is not None
            ), "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        # required mask seq length can be calculated via length of past
        mask_seq_length = (
            past_key_values[0][0].shape[2] + seq_length
            if past_key_values is not None
            else seq_length
        )

        if use_cache is True:
            assert (
                self.is_decoder
            ), f"`use_cache` can only be set to `True` if {self} is used as a decoder"

        if attention_mask is None:
            attention_mask = torch.ones(
                batch_size, mask_seq_length, device=inputs_embeds.device
            )
        if (
            self.is_decoder
            and encoder_attention_mask is None
            and encoder_hidden_states is not None
        ):
            encoder_seq_length = encoder_hidden_states.shape[1]
            encoder_attention_mask = torch.ones(
                batch_size,
                encoder_seq_length,
                device=inputs_embeds.device,
                dtype=torch.long,
            )

        # initialize past_key_values with `None` if past does not exist
        if past_key_values is None:
            past_key_values = [None] * len(self.block)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(
            attention_mask, input_shape
        )

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            (
                encoder_batch_size,
                encoder_sequence_length,
                _,
            ) = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(
                    encoder_hidden_shape, device=inputs_embeds.device
                )
            encoder_extended_attention_mask = self.invert_attention_mask(
                encoder_attention_mask
            )
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(
            cross_attn_head_mask, self.config.num_layers
        )
        present_key_value_states = () if use_cache else None
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, (layer_module, past_key_value) in enumerate(
            zip(self.block, past_key_values)
        ):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            # Model parallel
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=extended_attention_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                layer_head_mask=layer_head_mask,
                cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )

            # layer_outputs is a tuple with:
            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

            hidden_states, present_key_value_state = layer_outputs[:2]

            # We share the position biases between the layers - the first layer store them
            # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
            # (cross-attention position bias), (cross-attention weights)
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[
                    4 if output_attentions else 3
                ]
            # append next layer key value states
            if use_cache:
                present_key_value_states = present_key_value_states + (
                    present_key_value_state,
                )

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    present_key_value_states,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=present_key_value_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )


class T5ForConditionalGeneration(T5PreTrainedModel):
    def __init__(self, config: T5Config, weights):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = TensorParallelEmbedding(prefix="shared", weights=weights)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(
            config=encoder_config,
            prefix="encoder",
            weights=weights,
            embed_tokens=self.shared,
        )

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(
            config=decoder_config,
            prefix="decoder",
            weights=weights,
            embed_tokens=self.shared,
        )

        try:
            self.lm_head = SpeculativeHead.load(
                config, prefix="lm_head", weights=weights
            )
        except RuntimeError:
            # Some models like t5-small were saved with shared weights unlike flan
            # Since they are declared as the same arch we have no choice but hope
            # that this is OK instead of using a proper flag.
            self.lm_head = SpeculativeHead.load(
                config, prefix="shared", weights=weights
            )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if (
            labels is not None
            and decoder_input_ids is None
            and decoder_inputs_embeds is None
        ):
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim**-0.5)

        logits, speculative_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to correct device to enable PP
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
            # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

        if not return_dict:
            output = (logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return (
            Seq2SeqLMOutput(
                loss=loss,
                logits=logits,
                past_key_values=decoder_outputs.past_key_values,
                decoder_hidden_states=decoder_outputs.hidden_states,
                decoder_attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                encoder_hidden_states=encoder_outputs.hidden_states,
                encoder_attentions=encoder_outputs.attentions,
            ),
            speculative_logits,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        decoder_attention_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # cut decoder_input_ids if past is used
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)

    def _reorder_cache(self, past_key_values, beam_idx):
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if past_key_values is None:
            logger.warning(
                "You might want to consider setting `use_cache=True` to speed up decoding"
            )
            return past_key_values

        reordered_decoder_past = ()
        for layer_past_states in past_key_values:
            # get the correct batch idx from layer past batch dim
            # batch dim of `past` is at 2nd position
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # need to set correct `past` for each of the four key / value states
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(
                        0, beam_idx.to(layer_past_state.device)
                    ),
                )

            assert reordered_layer_past_states[0].shape == layer_past_states[0].shape
            assert len(reordered_layer_past_states) == len(layer_past_states)

            reordered_decoder_past = reordered_decoder_past + (
                reordered_layer_past_states,
            )
        return reordered_decoder_past


================================================
FILE: server/text_generation_server/models/custom_modeling/vlm.py
================================================
def load_text_model(prefix, config, weights, name=None):
    if config.model_type == "llama":
        from text_generation_server.models.custom_modeling.flash_llama_modeling import (
            FlashLlamaForCausalLM,
        )

        return FlashLlamaForCausalLM(prefix, config, weights, name=name)
    elif config.model_type == "mistral":
        from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
            FlashMistralForCausalLM,
        )

        return FlashMistralForCausalLM(prefix, config, weights, name=name)
    elif config.model_type == "gemma":
        from text_generation_server.models.custom_modeling.flash_gemma_modeling import (
            FlashGemmaForCausalLM,
        )

        return FlashGemmaForCausalLM(prefix, config, weights, causal=False)
    elif config.model_type == "gemma2":
        from text_generation_server.models.custom_modeling.flash_gemma2_modeling import (
            FlashGemma2ForCausalLM,
        )

        return FlashGemma2ForCausalLM(prefix, config, weights)

    elif config.model_type == "gemma3" or config.model_type == "gemma3_text":
        from text_generation_server.models.custom_modeling.flash_gemma3_modeling import (
            FlashGemma3ForCausalLM,
        )

        return FlashGemma3ForCausalLM(prefix, config, weights)
    elif config.model_type == "paligemma":
        from text_generation_server.models.custom_modeling.flash_gemma_modeling import (
            FlashGemmaForCausalLM,
        )

        return FlashGemmaForCausalLM(prefix, config, weights)
    else:
        raise RuntimeError(f"Unsupported model type {config.model_type}")


def load_vision_model(prefix, config, weights):
    if config.model_type == "clip_vision_model":
        from text_generation_server.models.custom_modeling.clip import (
            CLIPVisionTransformer,
        )

        return CLIPVisionTransformer(
            prefix=f"{prefix}.vision_model", config=config, weights=weights
        )
    if (
        config.model_type == "siglip_vision_model"
        or config.model_type == "gemma3_vision"
    ):
        from text_generation_server.models.custom_modeling.siglip import (
            SiglipVisionTransformer,
        )

        # TODO: ensure that using the prefix doesn't break any existing models
        # that rely on the old prefix (update the old models if necessary)
        return SiglipVisionTransformer(
            # prefix="vision_model.vision_model", config=config, weights=weights
            prefix=f"{prefix}.vision_model",
            config=config,
            weights=weights,
        )
    else:
        raise RuntimeError(f"Unsupported model type {config.model_type}")


================================================
FILE: server/text_generation_server/models/flash_causal_lm.py
================================================
from contextlib import nullcontext
import math
import os
import time
import torch
import torch.distributed

import numpy as np

from loguru import logger
from dataclasses import dataclass
from opentelemetry import trace
from transformers import (
    PreTrainedTokenizerBase,
    AutoConfig,
    AutoTokenizer,
    GenerationConfig,
)
from typing import (
    Any,
    ContextManager,
    Iterable,
    Optional,
    Tuple,
    List,
    Type,
    Dict,
    Union,
)

from text_generation_server.adapters import AdapterBatchData, AdapterBatchMetadata
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from text_generation_server.utils.chunks import concat_text_chunks
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.models import Model
from text_generation_server.utils.log import log_master
from text_generation_server.utils.prefill_chunking import (
    get_support_chunking,
    get_max_prefill_tokens,
)
from text_generation_server.utils.tokens import batch_top_tokens
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
)
from text_generation_server.models.types import (
    Batch,
    Tokens,
    Generation,
    GeneratedText,
)
from text_generation_server.pb import generate_pb2
from text_generation_server.models.globals import (
    MEM_POOL,
    ATTENTION,
    BLOCK_SIZE,
    CUDA_GRAPHS,
    REQUEST_LOGPROBS,
    TGI_WIGGLE_ROOM,
    get_adapter_to_index,
)
from text_generation_server.layers.attention import KVCache, Seqlen
from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser
from text_generation_server.utils.dist import MEMORY_FRACTION
from text_generation_server.utils.quantization import get_loader
from text_generation_server.utils.segments import SegmentConcatBuilder, find_segments

from text_generation_server.utils.import_utils import (
    empty_cache,
    synchronize,
    get_free_memory,
)
from text_generation_server.models.metadata_kernels import (
    has_triton,
    copy_next_input_ids_inplace,
    block_tables_to_ragged,
    block_tables_to_padded,
    prepare_position_slot_ids,
    slots_filtering,
)

tracer = trace.get_tracer(__name__)


def small_power_of_2(n: int):
    return 1 << ((n - 1).bit_length() - 1)


def init_cpu_threads_env(rank_id: int, world_size: int):
    import importlib.util

    if importlib.util.find_spec("numa") is not None:
        import numa
        import psutil

        nodes = numa.info.get_max_node() + 1
        rank_per_node = math.ceil(world_size / nodes)
        num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes)
        node_id = int(rank_id / rank_per_node)
        rank_offset_per_node = rank_id % rank_per_node
        if os.getenv("OMP_NUM_THREADS") is None:
            num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1)
        else:
            num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS"))
        if len(numa.memory.get_membind_nodes()) == nodes:
            numa.memory.set_membind_nodes((node_id))
        torch.set_num_threads(num_cpus_per_rank)
        if len(numa.schedule.get_affinitive_cpus(0)) == psutil.cpu_count(logical=True):
            cpu_start = num_cpus_per_rank * rank_offset_per_node
            numa.schedule.run_on_cpus(
                0,
                *(
                    numa.info.node_to_cpus(node_id)[
                        cpu_start : cpu_start + num_cpus_per_rank
                    ]
                ),
            )
        logger.info(
            f"affinity={numa.schedule.get_affinitive_cpus(0)}, membind = {numa.memory.get_membind_nodes()}"
        )


@dataclass
class FlashCausalLMBatch(Batch):
    batch_id: int
    requests: List[generate_pb2.Request]
    # request id -> idx in list mapping
    requests_idx_mapping: Dict[int, int]

    # Decoder values
    # Can be a list for easy filtering
    # If `input_ids` is a list, it needs to be materialized to a tensor first
    input_ids: Union[torch.Tensor, List[List[int]]]
    # Will be set by `generate_token` and reset after each prefill forward before staying set in decode
    position_ids: Optional[torch.Tensor]
    speculative_ids: Optional[torch.Tensor]

    # Set when creating the batch
    # tensor of indices of the currently used slots, length = \sum_{i=0}^{b} s_i in prefill, length = b in decode
    # Will be set by `generate_token` and reset after each prefill forward before staying set in decode
    slot_indices: Optional[torch.Tensor]

    # list of length b of list of length s_i // block_size
    block_tables: List[List[int]]
    # tensor of size [b, max_total_seqlen // block_size] holding the paged attention block tables for all sequences
    block_tables_tensor: torch.Tensor
    # tensor of length \sum_{i=0}^{b} max_s_i  holding the paged attention slots for all sequences
    slots: torch.Tensor
    # list of length b + 1  containing the cumulative sequence slot lengths of the sequences in the batch
    # used for filtering
    cu_slots: torch.Tensor

    max_input_length: int
    max_current_length: int

    # Whether this batch contains at least one request that is prefilling
    prefilling: bool
    # Whether each request is prefilling
    prefilling_mask: List[bool]

    # Prefill metadata tensors to efficiently compute logprobs
    # tensor of length b + 1  containing the cumulative sequence lengths of the sequences in the batch, only used in prefill
    cu_seqlen_prefill: Optional[torch.Tensor]
    # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers
    # as we only keep SLIDING_WINDOW values instead of the whole tensor
    prefill_cache_indices: Optional[torch.Tensor]
    # Will be set by `generate_token` and reset after each prefill forward
    prefill_head_indices: Optional[torch.Tensor]
    # Will be set by `generate_token` and reset after each prefill forward
    prefill_next_token_indices: Optional[torch.tensor]
    # Will be set by `generate_token` and reset after each prefill forward
    prefill_cu_outlens: Optional[List[int]]
    # Will be set by `generate_token` and reset after each prefill forward
    prefill_logprob_tokens: List[Optional[Tokens]]

    # All tokens
    all_input_ids: List[List[int]]
    all_input_ids_tensor: torch.Tensor

    # Lengths of all generations present in the batch
    input_lengths: List[int]
    # size [b], containing the number of blocks that can be retrieved from the cache
    cache_lengths: List[int]
    prompt_lengths: List[int]
    # Will be set by `generate_token` and reset after each prefill forward before staying set in decode
    input_lengths_tensor: Optional[torch.Tensor]
    cache_lengths_tensor: Optional[torch.Tensor]
    prompt_lengths_tensor: torch.Tensor

    prefix_offsets: List[Optional[int]]
    read_offsets: List[Optional[int]]

    # Generation helpers
    next_token_chooser: HeterogeneousNextTokenChooser
    stopping_criterias: List[StoppingCriteria]
    top_n_tokens: List[int]
    top_n_tokens_tensor: torch.Tensor

    # Adapter metadata for each request
    # Will be set by `generate_token` and reset after each prefill forward before staying set in decode
    adapter_meta: Optional[AdapterBatchMetadata]

    # Number of blocks in this batch
    num_blocks: int
    # Maximum number of blocks
    max_blocks: int

    def to_pb(self) -> generate_pb2.CachedBatch:
        return generate_pb2.CachedBatch(
            id=self.batch_id,
            request_ids=[r.id for r in self.requests],
            size=len(self),
            max_tokens=self.num_blocks * BLOCK_SIZE,
            current_tokens=(
                sum([len(i) for i in self.input_ids])
                if isinstance(self.input_ids, list)
                else len(self.input_ids)
            ),
        )

    @classmethod
    def batch_tokenized_inputs(
        cls, requests: Iterable[generate_pb2.Request], tokenizer
    ):
        max_length = 0
        all_input_ids = []
        batch_size = 0
        for r in requests:
            batch_size += 1
            inputs = concat_text_chunks(r.input_chunks.chunks)
            input_ids = tokenizer(
                inputs,
                truncation=True,
                max_length=r.truncate,
                add_special_tokens=r.add_special_tokens,
            )["input_ids"]
            max_length = max(max_length, len(input_ids))
            all_input_ids.append(input_ids)
        return all_input_ids

    @classmethod
    def from_tokenized(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        batch_tokenized_inputs,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "FlashCausalLMBatch":
        speculate = get_speculate()

        cache_lengths = []
        input_lengths = []
        prompt_lengths = []
        prefix_offsets = []
        read_offsets = []
        all_input_ids = []
        all_postfix_ids = []
        requests_idx_mapping = {}
        slots = []
        cu_slots = [0]

        next_token_chooser_parameters = []
        stopping_criterias = []
        top_n_tokens = []

        num_blocks = 0
        max_input_length = 0
        max_current_length = 0
        max_length = 0
        max_blocks = 0

        cu_blocks = [0]
        block_tables = []
        block_tables_ragged = []

        # Parse batch
        for i, (r, tokenized_input) in enumerate(
            zip(pb.requests, batch_tokenized_inputs)
        ):
            ### XXX: This consumes so much memory on long requests
            ### Deactivating it by default seems like the best course.
            if not REQUEST_LOGPROBS:
                r.prefill_logprobs = False
            # request id -> idx in list mapping
            requests_idx_mapping[r.id] = i

            prompt_length = len(tokenized_input)
            prompt_lengths.append(prompt_length)

            cache_length = r.cache_len

            assert (
                cache_length <= prompt_length
            ), f"Prefix {cache_length} vs input {prompt_length}"
            if cache_length == prompt_length:
                assert False, "unreachable"

            # `chunk_len` is an optional field in the protobuf
            # It is only set if the model support chunking
            if r.HasField("chunk_len"):
                input_length = r.chunk_len

                if cache_length + input_length < prompt_length:
                    # FIXME: speculate is not supported for context chunking at the moment
                    assert speculate == 0
                    assert get_support_chunking()
                    assert input_length > 0

                postfix_ids = tokenized_input[
                    cache_length : cache_length + input_length
                ]
                assert (
                    len(postfix_ids) == input_length
                ), "Rust and Python tokenizers are not aligned"
            else:
                # Use all the remaining ids
                postfix_ids = tokenized_input[cache_length:]
                input_length = len(postfix_ids)

            input_lengths.append(input_length)

            prefix_offsets.append(prompt_length - 5)
            read_offsets.append(prompt_length)

            all_postfix_ids.append(postfix_ids)
            all_input_ids.append(tokenized_input)

            next_token_chooser_parameters.append(r.parameters)

            stopping_criteria = StoppingCriteria.from_pb(
                r.stopping_parameters, tokenizer
            )
            max_new_tokens = stopping_criteria.max_new_tokens
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(r.top_n_tokens)

            # Paged attention
            # Remove one as the first token des not have a past
            speculative_length = get_speculate()
            speculative_length = 0 if speculative_length is None else speculative_length

            # Tokens that need to be mapped to blocks.
            block_tokens = prompt_length + max_new_tokens - 1 + speculative_length

            # blocks and slots can be empty (for example in warmup)
            if not r.blocks:
                needed_blocks = math.ceil(block_tokens / BLOCK_SIZE)
                request_blocks = [
                    b for b in range(num_blocks, num_blocks + needed_blocks)
                ]
                request_slots = [
                    s
                    for b in request_blocks
                    for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE)
                ]
            else:
                request_blocks = r.blocks
                request_slots = r.slots

            block_tables.append(request_blocks)
            block_tables_ragged.extend(request_blocks)
            cu_blocks.append(len(block_tables_ragged))

            slots.extend(request_slots)
            cu_slots.append(len(slots))

            cache_lengths.append(cache_length)
            num_blocks += len(request_blocks)

            # Update
            max_blocks = max(max_blocks, len(request_blocks))
            max_input_length = max(max_input_length, input_length)
            max_current_length = max(max_current_length, cache_length + input_length)
            max_length = max(
                max_length,
                prompt_length + max_new_tokens + speculative_length,
            )

        next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
            next_token_chooser_parameters, dtype, device, tokenizer
        )

        # Padded all_input_ids_tensor
        all_input_ids_tensor = np.zeros(
            (len(all_input_ids), max_length), dtype=np.int64
        )
        for i, input_ids in enumerate(all_input_ids):
            all_input_ids_tensor[i, : len(input_ids)] = input_ids

        # Create tensors on device
        all_input_ids_tensor = torch.tensor(
            all_input_ids_tensor, dtype=torch.int64, device=device
        )

        top_n_tokens_tensor = torch.tensor(
            top_n_tokens, device=device, dtype=torch.int64
        )

        block_tables_ragged = torch.tensor(
            block_tables_ragged, device=device, dtype=torch.int32
        )
        cu_blocks = torch.tensor(cu_blocks, device=device, dtype=torch.int64)
        block_tables_tensor = torch.empty(
            (len(block_tables), max_blocks),
            device=device,
            dtype=torch.int32,
        )

        # If the device supports Triton, we can use a fused kernel
        if has_triton():
            block_tables_to_padded(
                max_blocks, cu_blocks, block_tables_tensor, block_tables_ragged
            )
        else:
            for i, request_blocks in enumerate(block_tables):
                block_tables_tensor[i, : len(request_blocks)] = torch.tensor(
                    request_blocks
                )

        prompt_lengths_tensor = torch.tensor(
            prompt_lengths, dtype=torch.int32, device=device
        )

        slots = torch.tensor(slots, dtype=torch.int64, device=device)
        cu_slots = torch.tensor(cu_slots, dtype=torch.int64)

        return cls(
            batch_id=pb.id,
            requests=pb.requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=all_postfix_ids,
            block_tables=block_tables,
            block_tables_tensor=block_tables_tensor,
            cache_lengths=cache_lengths,
            max_input_length=max_input_length,
            max_current_length=max_current_length,
            prefilling=True,
            prefilling_mask=[True] * len(pb.requests),
            prefill_logprob_tokens=[None] * len(pb.requests),
            input_lengths=input_lengths,
            prompt_lengths=prompt_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            all_input_ids=all_input_ids,
            all_input_ids_tensor=all_input_ids_tensor,
            next_token_chooser=next_token_chooser,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            num_blocks=num_blocks,
            max_blocks=max_blocks,
            speculative_ids=None,
            prompt_lengths_tensor=prompt_lengths_tensor,
            # These values will be set by `FlashCausalLMBatch.prepare_for_prefill`
            position_ids=None,
            cu_seqlen_prefill=None,
            prefill_cache_indices=None,
            slot_indices=None,
            slots=slots,
            cu_slots=cu_slots,
            prefill_head_indices=None,
            prefill_next_token_indices=None,
            prefill_cu_outlens=None,
            cache_lengths_tensor=None,
            input_lengths_tensor=None,
            adapter_meta=None,
        )

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "FlashCausalLMBatch":
        assert len(pb.requests) > 0
        batch_tokenized_inputs = cls.batch_tokenized_inputs(pb.requests, tokenizer)
        return cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device)

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch":
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")
        # We assume that if len(requests) == len(self) then the requests are the same
        if len(request_ids) == len(self):
            return self

        device = self.block_tables_tensor.device

        # New values after filtering
        requests_idx_mapping = {}

        # Used to index into tensors
        indices = []

        if not has_triton():
            # slots to keep after filtering
            slot_filtering_indices = torch.zeros(
                self.slots.shape[0], dtype=torch.bool, device=device
            )

        # Create on CPU to only move to GPU once instead of at every copy
        slot_indices = torch.empty(len(request_ids), dtype=torch.int64)
        max_input_length = 0
        max_current_length = 0

        requests = []
        block_tables = []
        all_input_ids = []
        input_ids = []

        prompt_lengths = []
        input_lengths = []
        cache_lengths = []
        prefix_offsets = []
        read_offsets = []
        cu_slots = [0]

        prefilling_mask = []
        prefill_logprob_tokens = []

        stopping_criterias = []
        top_n_tokens = []
        adapter_set = set()

        num_blocks = 0
        max_blocks = 0
        max_slots = 0
        cumulative_slot_tokens = 0

        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            indices.append(idx)
            requests_idx_mapping[request_id] = i

            requests.append(self.requests[idx])

            # Prefilling
            request_prefilling = self.prefilling_mask[idx]
            prefilling_mask.append(request_prefilling)

            # Get length
            request_input_length = self.input_lengths[idx]
            request_cache_length = self.cache_lengths[idx]
            max_input_length = max(max_input_length, request_input_length)
            max_current_length = max(
                max_current_length, request_cache_length + request_input_length
            )

            all_input_ids.append(self.all_input_ids[idx])

            prompt_lengths.append(self.prompt_lengths[idx])
            input_lengths.append(request_input_length)
            cache_lengths.append(request_cache_length)
            prefix_offsets.append(self.prefix_offsets[idx])
            read_offsets.append(self.read_offsets[idx])

            stopping_criteria = self.stopping_criterias[idx]
            stopping_criterias.append(stopping_criteria)

            top_n_tokens.append(self.top_n_tokens[idx])
            prefill_logprob_tokens.append(self.prefill_logprob_tokens[idx])

            ADAPTER_TO_INDEX = get_adapter_to_index()
            adapter_index = ADAPTER_TO_INDEX.get(self.requests[idx].adapter_id, 0)
            adapter_set.add(adapter_index)

            request_block_table = self.block_tables[idx]
            num_blocks += len(request_block_table)
            block_tables.append(request_block_table)

            start_slot = self.cu_slots[idx]
            end_slot = self.cu_slots[idx + 1]
            slot_length = end_slot - start_slot

            if not has_triton():
                # Set slice
                slot_filtering_indices[start_slot:end_slot] = True

            cu_slots.append(cumulative_slot_tokens + slot_length)

            # Input ids if the request was part of a prefilling batch
            # If the batch was decoding we can index into the tensor directly later
            if self.prefilling:
                input_ids.append(self.input_ids[idx])
            else:
                # Copy to tensor (CPU)
                slot_indices[i] = cumulative_slot_tokens + request_cache_length

            cumulative_slot_tokens += slot_length
            max_blocks = max(max_blocks, len(request_block_table))
            max_slots = max(max_slots, slot_length)

        all_input_ids_tensor = self.all_input_ids_tensor[indices]
        block_tables_tensor = self.block_tables_tensor[indices]
        next_token_chooser = self.next_token_chooser.filter(indices)
        top_n_tokens_tensor = self.top_n_tokens_tensor[indices]
        speculative_ids = (
            self.speculative_ids[indices] if self.speculative_ids is not None else None
        )
        prompt_lengths_tensor = self.prompt_lengths_tensor[indices]

        cu_slots = torch.tensor(cu_slots, dtype=torch.int64)

        if not has_triton():
            slots = self.slots[slot_filtering_indices]
        else:
            slots = self.slots.new_empty(cumulative_slot_tokens)
            gpu_cu_slots = cu_slots.to(device)
            slots_indexing_start = self.cu_slots.to(device)[indices]
            slots_filtering(
                max_slots, self.slots, slots, gpu_cu_slots, slots_indexing_start
            )

        if self.prefilling:
            # These values will be set by `FlashCausalLMBatch.prepare_for_prefill`
            position_ids = None
            slot_indices = None
            cache_lengths_tensor = None
            input_lengths_tensor = None
            adapter_meta = None
        else:
            # Index into tensors
            input_ids = self.input_ids[indices]
            position_ids = self.position_ids[indices]
            adapter_indices = self.adapter_meta.adapter_indices[indices]
            input_lengths_tensor = self.input_lengths_tensor[indices]
            cache_lengths_tensor = self.cache_lengths_tensor[indices]

            # Move to GPU now that we have the whole tensor
            slot_indices = slot_indices.to(device)

            adapter_segments, adapter_segment_indices = find_segments(adapter_indices)
            adapter_segments = torch.tensor(
                adapter_segments, dtype=torch.int32, device=device
            )
            adapter_meta = AdapterBatchMetadata(
                adapter_indices=adapter_indices,
                adapter_set=adapter_set,
                adapter_segments=adapter_segments,
                segment_indices=adapter_segment_indices,
            )

        return type(self)(
            batch_id=self.batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            position_ids=position_ids,
            cu_seqlen_prefill=None,
            prefill_cache_indices=None,
            slot_indices=slot_indices,
            block_tables=block_tables,
            block_tables_tensor=block_tables_tensor,
            slots=slots,
            cu_slots=cu_slots,
            max_input_length=max_input_length,
            max_current_length=max_current_length,
            prefilling=self.prefilling,
            prefilling_mask=prefilling_mask,
            prefill_head_indices=None,
            prefill_next_token_indices=None,
            prefill_cu_outlens=None,
            prefill_logprob_tokens=prefill_logprob_tokens,
            prompt_lengths=prompt_lengths,
            prompt_lengths_tensor=prompt_lengths_tensor,
            input_lengths=input_lengths,
            input_lengths_tensor=input_lengths_tensor,
            cache_lengths=cache_lengths,
            cache_lengths_tensor=cache_lengths_tensor,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            all_input_ids=all_input_ids,
            all_input_ids_tensor=all_input_ids_tensor,
            next_token_chooser=next_token_chooser,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            num_blocks=num_blocks,
            max_blocks=max_blocks,
            speculative_ids=speculative_ids,
            adapter_meta=adapter_meta,
        )

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch":
        # Batch attributes
        requests = []
        requests_idx_mapping = {}

        prefilling = False
        num_blocks = 0
        total_batch_size = 0
        total_slots = 0
        max_blocks = 0
        max_length = 0
        max_input_length = 0
        max_current_length = 0
        for b in batches:
            total_batch_size += len(b)
            max_blocks = max(max_blocks, b.max_blocks)
            total_slots += len(b.slots)
            num_blocks += b.num_blocks
            speculative_length = (
                b.speculative_ids.shape[1] if b.speculative_ids is not None else 0
            )
            max_input_length = max(max_input_length, b.max_input_length)
            max_current_length = max(max_current_length, b.max_current_length)
            max_length = max(
                max_length,
                max(
                    prompt_length
                    + stopping_criteria.max_new_tokens
                    + speculative_length
                    for prompt_length, stopping_criteria in zip(
                        b.prompt_lengths, b.stopping_criterias
                    )
                ),
            )
            prefilling = prefilling or b.prefilling

        slots = batches[0].slots.new_empty(total_slots)
        cu_slots = torch.zeros(total_batch_size + 1, dtype=torch.int64)
        if prefilling:
            input_ids = []
            # These values will be set by `FlashCausalLMBatch.prepare_for_prefill`
            position_ids = None
            slot_indices = None
            cache_lengths_tensor = None
            input_lengths_tensor = None
            adapter_meta = None
            adapter_segment_builder = None
        else:
            input_ids = batches[0].input_ids.new_empty(total_batch_size)
            if (
                batches[0].position_ids is not None
                and batches[0].position_ids.dim() == 2
            ):
                # Qwen2_vl case:
                position_ids = batches[0].position_ids.new_empty(
                    (total_batch_size, batches[0].position_ids.shape[-1])
                )
            else:
                position_ids = batches[0].position_ids.new_empty(total_batch_size)
            slot_indices = batches[0].slot_indices.new_empty(total_batch_size)
            input_lengths_tensor = batches[0].input_lengths_tensor.new_empty(
                total_batch_size
            )
            cache_lengths_tensor = batches[0].cache_lengths_tensor.new_empty(
                total_batch_size
            )
            total_indices_size = sum(
                b.adapter_meta.adapter_indices.shape[0] for b in batches
            )
            adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty(
                total_indices_size
            )
            adapter_segment_builder = SegmentConcatBuilder()
            adapter_set = set()

        prompt_lengths_tensor = batches[0].prompt_lengths_tensor.new_empty(
            total_batch_size
        )
        block_tables_tensor = batches[0].block_tables_tensor.new_zeros(
            (total_batch_size, max_blocks)
        )
        all_input_ids_tensor = batches[0].all_input_ids_tensor.new_zeros(
            (total_batch_size, max_length)
        )
        top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros(
            total_batch_size,
        )

        block_tables = []
        cache_lengths = []
        all_input_ids = []

        prompt_lengths = []
        input_lengths = []
        prefix_offsets = []
        read_offsets = []

        prefill_logprob_tokens = []

        next_token_chooser_parameters = []
        fsm_grammar_states = []
        stopping_criterias = []
        top_n_tokens = []
        prefilling_mask = []

        # Cumulative length
        cumulative_batch_size = 0
        cumulative_slots = 0
        cumulative_adapter_indices_size = 0

        for i, batch in enumerate(batches):
            requests.extend(batch.requests)

            if i == 0:
                requests_idx_mapping = batch.requests_idx_mapping
            else:
                # We need to offset the mapping for each batch by the cumulative batch size
                for k, v in batch.requests_idx_mapping.items():
                    requests_idx_mapping[k] = v + cumulative_batch_size

            start_index = cumulative_batch_size
            end_index = cumulative_batch_size + len(batch)

            # Copy tensors (GPU)
            top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor
            all_input_ids_tensor[
                start_index:end_index, : batch.all_input_ids_tensor.shape[1]
            ] = batch.all_input_ids_tensor[:, :max_length]

            block_tables_tensor[
                start_index:end_index, : batch.block_tables_tensor.shape[1]
            ] = batch.block_tables_tensor[:, :max_blocks]
            prompt_lengths_tensor[start_index:end_index] = batch.prompt_lengths_tensor

            slots_start_index = cumulative_slots
            slots_end_index = cumulative_slots + len(batch.slots)
            slots[slots_start_index:slots_end_index] = batch.slots
            cu_slots[start_index + 1 : end_index + 1] = (
                batch.cu_slots[1:] + cumulative_slots
            )

            if not prefilling:
                input_ids[start_index:end_index] = batch.input_ids
                position_ids[start_index:end_index] = batch.position_ids
                slot_indices[start_index:end_index] = (
                    batch.slot_indices + cumulative_slots
                )
                input_lengths_tensor[start_index:end_index] = batch.input_lengths_tensor
                cache_lengths_tensor[start_index:end_index] = batch.cache_lengths_tensor

                # Copy over adapter indices
                adapter_start_index = cumulative_adapter_indices_size
                adapter_end_index = (
                    cumulative_adapter_indices_size
                    + batch.adapter_meta.adapter_indices.shape[0]
                )
                adapter_indices[adapter_start_index:adapter_end_index] = (
                    batch.adapter_meta.adapter_indices
                )
                cumulative_adapter_indices_size = adapter_end_index
                adapter_set.update(batch.adapter_meta.adapter_set)
                adapter_segment_builder.concat(
                    batch.adapter_meta.adapter_segments,
                    batch.adapter_meta.segment_indices,
                )
            else:
                if isinstance(batch.input_ids, torch.Tensor):
                    batch.input_ids = batch.input_ids.view(-1, 1).tolist()
                input_ids.extend(batch.input_ids)

            prefilling_mask.extend(batch.prefilling_mask)
            block_tables.extend(batch.block_tables)
            cache_lengths.extend(batch.cache_lengths)
            all_input_ids.extend(batch.all_input_ids)

            prompt_lengths.extend(batch.prompt_lengths)
            input_lengths.extend(batch.input_lengths)
            prefix_offsets.extend(batch.prefix_offsets)
            read_offsets.extend(batch.read_offsets)

            prefill_logprob_tokens.extend(batch.prefill_logprob_tokens)

            next_token_chooser_parameters.extend([r.parameters for r in batch.requests])
            fsm_grammar_states.extend(batch.next_token_chooser.fsm_grammar_states)
            stopping_criterias.extend(batch.stopping_criterias)

            top_n_tokens.extend(batch.top_n_tokens)

            # Update
            cumulative_slots += len(batch.slots)
            cumulative_batch_size += len(batch)

        next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
            next_token_chooser_parameters,
            dtype=batches[0].next_token_chooser.dtype,
            device=batches[0].next_token_chooser.device,
            tokenizer=batches[0].next_token_chooser.tokenizer,
            fsm_grammar_states=fsm_grammar_states,
        )

        # We skip computing the speculative_ids when the batch size is too large, so
        # we must check that all batches have them, otherwise they must be discarded
        if get_speculate() > 0 and all(b.speculative_ids is not None for b in batches):
            speculative_ids = torch.cat([b.speculative_ids for b in batches], dim=0)
        else:
            speculative_ids = None

        if adapter_segment_builder is not None:
            adapter_segments, adapter_segment_indices = adapter_segment_builder.build()
            adapter_meta = AdapterBatchMetadata(
                adapter_indices=adapter_indices,
                adapter_set=adapter_set,
                adapter_segments=adapter_segments,
                segment_indices=adapter_segment_indices,
            )

        return cls(
            batch_id=batches[0].batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            position_ids=position_ids,
            cu_seqlen_prefill=None,
            prefill_cache_indices=None,
            slot_indices=slot_indices,
            block_tables=block_tables,
            block_tables_tensor=block_tables_tensor,
            cache_lengths=cache_lengths,
            cache_lengths_tensor=cache_lengths_tensor,
            slots=slots,
            cu_slots=cu_slots,
            max_input_length=max_input_length,
            max_current_length=max_current_length,
            prefilling=prefilling,
            prefilling_mask=prefilling_mask,
            prefill_head_indices=None,
            prefill_next_token_indices=None,
            prefill_cu_outlens=None,
            prefill_logprob_tokens=prefill_logprob_tokens,
            prompt_lengths=prompt_lengths,
            prompt_lengths_tensor=prompt_lengths_tensor,
            input_lengths=input_lengths,
            input_lengths_tensor=input_lengths_tensor,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            all_input_ids=all_input_ids,
            all_input_ids_tensor=all_input_ids_tensor,
            next_token_chooser=next_token_chooser,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            num_blocks=num_blocks,
            max_blocks=max_blocks,
            speculative_ids=speculative_ids,
            adapter_meta=adapter_meta,
        )

    def prepare_for_prefill(self):
        # Prepare values if we need to continue prefilling
        # Speculation must be ignored while we prefill even with chunking
        # it simplifies everything
        assert self.speculative_ids is None

        device = self.block_tables_tensor.device

        if isinstance(self.input_ids, list):
            if len(self) > 1:
                input_ids = np.concatenate(self.input_ids, dtype=np.int64)
            else:
                input_ids = self.input_ids[0]
            self.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)

        self.input_lengths_tensor = torch.tensor(
            self.input_lengths, dtype=torch.int32, device=device
        )
        cu_seqlen_prefill = self.input_lengths_tensor.new_zeros(len(self) + 1)
        torch.cumsum(self.input_lengths_tensor, out=cu_seqlen_prefill[1:], dim=0)
        self.cu_seqlen_prefill = cu_seqlen_prefill.to(torch.int32)
        self.cache_lengths_tensor = torch.tensor(
            self.cache_lengths, dtype=torch.int32, device=device
        )

        # If the device supports Triton, we can use a fused kernel
        if has_triton():
            self.position_ids = torch.empty(
                len(self.input_ids), dtype=torch.int32, device=device
            )
            self.slot_indices = torch.empty(
                len(self.input_ids), dtype=torch.int64, device=device
            )
            cu_slots_gpu = self.cu_slots.to(device)

            prepare_position_slot_ids(
                self.max_input_length,
                self.cache_lengths_tensor,
                self.cu_seqlen_prefill,
                cu_slots_gpu,
                self.position_ids,
                self.slot_indices,
            )

        position_ids = []
        slot_indices = []
        all_prefill_logprobs = True
        no_prefill_logprobs = True
        prefill_cu_outlens = [0]

        # Cumulative length
        cumulative_length = 0
        cumulative_slot_tokens = 0
        prefill_out_cumulative_length = 0

        adapter_indices_list = []
        adapter_set = set()

        for i, (
            r,
            cache_length,
            input_length,
            prompt_length,
            request_prefilling,
            blocks,
        ) in enumerate(
            zip(
                self.requests,
                self.cache_lengths,
                self.input_lengths,
                self.prompt_lengths,
                self.prefilling_mask,
                self.block_tables,
            )
        ):
            next_chunk_length = input_length

            if not has_triton():
                # Position ids
                request_position_ids = torch.arange(
                    cache_length, cache_length + input_length, dtype=torch.int32
                )
                position_ids.append(request_position_ids)

                if not r.slots:
                    request_slots = [
                        s
                        for b in blocks
                        for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE)
                    ]
                else:
                    request_slots = r.slots

                request_slot_indices = torch.arange(
                    cache_length + cumulative_slot_tokens,
                    cache_length + cumulative_slot_tokens + input_length,
                    dtype=torch.int64,
                )

                slot_indices.append(request_slot_indices)

                # Update
                cumulative_slot_tokens += len(request_slots)

            # Prefill logprobs is ignored if the request is done prefilling
            prefill_logprobs = r.prefill_logprobs and request_prefilling

            all_prefill_logprobs = all_prefill_logprobs and prefill_logprobs
            no_prefill_logprobs = no_prefill_logprobs and not prefill_logprobs

            if prefill_logprobs:
                prefill_cu_outlens.append(prefill_out_cumulative_length + input_length)
                prefill_out_cumulative_length += input_length
            else:
                prefill_cu_outlens.append(prefill_out_cumulative_length + 1)
                prefill_out_cumulative_length += 1

            ADAPTER_TO_INDEX = get_adapter_to_index()
            if ADAPTER_TO_INDEX:
                adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0)
                adapter_indices_list.append(
                    torch.full((next_chunk_length,), adapter_index)
                )
                adapter_set.add(adapter_index)

            # Update
            cumulative_length += next_chunk_length

        if not all_prefill_logprobs and not no_prefill_logprobs:
            prefill_head_indices = []
            prefill_next_token_indices = []

            # Cumulative length
            cumulative_length = 0
            prefill_out_cumulative_length = 0

            for i, (
                r,
                input_length,
                request_prefilling,
            ) in enumerate(
                zip(
                    self.requests,
                    self.input_lengths,
                    self.prefilling_mask,
                )
            ):
                # Prefill logprobs is ignored if the request is done prefilling
                prefill_logprobs = r.prefill_logprobs and request_prefilling

                if prefill_logprobs:
                    prefill_head_indices.append(
                        torch.arange(
                            cumulative_length,
                            cumulative_length + input_length,
                            dtype=torch.int64,
                        )
                    )
                    prefill_next_token_indices.append(
                        prefill_out_cumulative_length + input_length - 1
                    )
                    prefill_out_cumulative_length += input_length
                else:
                    prefill_head_indices.append(
                        torch.tensor(
                            [cumulative_length + input_length - 1],
                            dtype=torch.int64,
                        )
                    )
                    prefill_next_token_indices.append(prefill_out_cumulative_length)
                    prefill_out_cumulative_length += 1

                # Update
                cumulative_length += input_length

        if len(self) > 1:
            if position_ids:
                position_ids = torch.cat(position_ids)
            if slot_indices:
                slot_indices = torch.cat(slot_indices)
        else:
            if position_ids:
                position_ids = position_ids[0]
            if slot_indices:
                slot_indices = slot_indices[0]

        if not has_triton():
            self.position_ids = position_ids.to(device)
            self.slot_indices = slot_indices.to(device)

        self.prefill_cu_outlens = prefill_cu_outlens
        self.prefill_cache_indices = None

        if all_prefill_logprobs:
            prefill_head_indices = None
            prefill_next_token_indices = self.cu_seqlen_prefill[1:] - 1
        elif no_prefill_logprobs:
            prefill_head_indices = self.cu_seqlen_prefill[1:] - 1
            prefill_next_token_indices = None
        else:
            prefill_head_indices = torch.cat(prefill_head_indices).to(device)
            prefill_next_token_indices = torch.tensor(
                prefill_next_token_indices, dtype=torch.int64, device=device
            )

        self.prefill_head_indices = prefill_head_indices
        self.prefill_next_token_indices = prefill_next_token_indices

        if adapter_set:
            adapter_indices = torch.cat(adapter_indices_list).to(
                dtype=torch.int64, device=device
            )
            adapter_segments, adapter_segment_indices = find_segments(adapter_indices)
        else:
            adapter_indices = torch.zeros_like(self.input_ids)
            adapter_segments = [0, len(adapter_indices)]
            adapter_segment_indices = [len(adapter_indices) - 1]

        adapter_segments = torch.tensor(
            adapter_segments, dtype=torch.int32, device=device
        )

        self.adapter_meta = AdapterBatchMetadata(
            adapter_indices=adapter_indices,
            adapter_set=adapter_set,
            adapter_segments=adapter_segments,
            segment_indices=adapter_segment_indices,
        )

    def __len__(self):
        return len(self.requests)


ADAPTER_LAYERS = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
ROW_PARALLEL = {"o_proj", "down_proj", "lm_head"}


class FlashCausalLM(Model):
    def __init__(
        self,
        model_id: str,
        model_class,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
        lora_adapter_ids: Optional[list] = [],
        tokenizer_class: PreTrainedTokenizerBase = AutoTokenizer,
        config_class: PreTrainedTokenizerBase = AutoConfig,
        default_dtype=torch.float16,
        aliases=None,
        # Used for Santacoder override of config
        num_kv_heads: Optional[int] = None,
        # Deepseek V2 uses different QK and V dims.
        head_size: Optional[int] = None,
        skip_special_tokens: bool = True,
        kv_cache_dtype: Optional[torch.dtype] = None,
        support_chunking: bool = True,
    ):
        self.quantize = quantize
        self.process_group, rank, world_size = initialize_torch_distributed()
        if torch.cuda.is_available():
            device = torch.device(f"cuda:{rank}")
            dtype = default_dtype if dtype is None else dtype
        elif SYSTEM == "ipex":
            if hasattr(torch, "xpu") and torch.xpu.is_available():
                device = torch.device(f"xpu:{rank}")
                dtype = default_dtype if dtype is None else dtype
            else:
                device = torch.device("cpu")
                dtype = torch.bfloat16 if dtype is None else dtype
                init_cpu_threads_env(rank_id=rank, world_size=world_size)
        else:
            raise NotImplementedError(f"{model_class} is only available on GPU")

        tokenizer = tokenizer_class.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        try:
            generation_config = GenerationConfig.from_pretrained(
                model_id, revision=revision, trust_remote_code=trust_remote_code
            )
            if isinstance(generation_config.eos_token_id, (list, set)):
                # TODO Huge hack
                tokenizer._eos_token_ids = set(generation_config.eos_token_id)
        except Exception:
            pass

        config = config_class.from_pretrained(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        config.quantize = quantize
        config.speculator = speculator

        torch.distributed.barrier(group=self.process_group)

        weights_loader = get_loader(quantize, model_id, revision)
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(
            filenames,
            device,
            dtype,
            process_group=self.process_group,
            aliases=aliases,
            weights_loader=weights_loader,
        )

        prefix = None
        model = model_class(prefix, config, weights)
        torch.distributed.barrier(group=self.process_group)

        # VLM models define the config we care about in their text_config
        text_config = getattr(config, "text_config", None)
        if text_config is not None:
            config = text_config

        if getattr(config, "sliding_window", None) is None:
            config.sliding_window = None

        self.num_layers = config.num_hidden_layers
        self.num_heads = config.num_attention_heads // self.process_group.size()
        self.config = config
        # Validation is done in the model itself
        if num_kv_heads is None:
            num_kv_heads = getattr(config, "num_key_value_heads", None)
            # GPT-2 workaround
            if num_kv_heads is None:
                num_kv_heads = getattr(config, "n_head", None)
        if num_kv_heads is None:
            raise ValueError("Cannot get the number of key/value heads")
        self.num_kv_heads = (
            num_kv_heads // self.process_group.size()
            if num_kv_heads > 1
            else num_kv_heads
        )
        assert self.num_kv_heads > 0

        if head_size is None:
            # Some models use GQA and different sizes for o_proj
            # and q_proj, that allows for that.
            if getattr(config, "head_dim", None) is not None:
                self.head_size = config.head_dim
            else:
                self.head_size = config.hidden_size // config.num_attention_heads
        else:
            self.head_size = head_size

        self.cuda_graphs = {}
        self.kv_cache = []
        self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype

        if ATTENTION == "flashinfer":
            from text_generation_server.layers.attention.flashinfer import (
                create_prefill_state,
                create_decode_state,
                create_prefill_with_paged_kv_state,
            )

            self.prefill_state = create_prefill_state(device=device)
            self.prefill_with_paged_kv_state = create_prefill_with_paged_kv_state(
                device=device
            )

            self.decode_state = create_decode_state(
                device=device,
                num_heads=self.num_heads,
                num_kv_heads=self.num_kv_heads,
            )

        super().__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=False,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
            sliding_window=config.sliding_window,
            support_chunking=support_chunking,
        )

    @property
    def batch_type(self) -> Type[FlashCausalLMBatch]:
        return FlashCausalLMBatch

    def init_kv_cache(
        self,
        num_blocks: int,
        num_layers: int,
        num_heads: int,
        head_size: int,
        dtype: torch.dtype,
        device: torch.device,
    ):
        self.kv_cache = []
        empty_cache()
        self.kv_cache = [
            KVCache(
                num_blocks=num_blocks,
                num_heads=num_heads,
                head_size=head_size,
                dtype=dtype,
                device=device,
            )
            for _ in range(num_layers)
        ]

    def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
        max_bs = max(self.cuda_graphs.keys()) if self.cuda_graphs else None
        input_lengths = [max_s] * bs
        cache_lengths = [0] * bs
        if max_bs is None:
            input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device)
            position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device)
            config = getattr(self.model, "config", None)
            rope_scaling = getattr(config, "rope_scaling", None) if config else None
            if (  # mrope have position_ids per section, if so repeat n times
                isinstance(rope_scaling, dict) and rope_scaling["rope_type"] == "mrope"
            ):
                n_sections = len(self.model.config.rope_scaling["mrope_section"])
                position_ids = position_ids.unsqueeze(1).repeat(1, n_sections)
            slots = torch.arange(bs, dtype=torch.int64, device=self.device)
            input_lengths_tensor = (
                torch.ones(bs, dtype=torch.int32, device=self.device) * max_s
            )
            cache_lengths_tensor = torch.zeros(
                bs, dtype=torch.int32, device=self.device
            )
            block_tables = torch.arange(
                max_bt, dtype=torch.int32, device=self.device
            ).repeat(bs)
            block_tables = block_tables.reshape((bs, max_bt))
            if ATTENTION == "flashinfer":
                block_tables = block_tables_to_ragged(
                    block_tables=block_tables,
                    input_lengths=input_lengths,
                    cache_lengths=cache_lengths,
                    input_lengths_tensor=input_lengths_tensor,
                    cache_lengths_tensor=cache_lengths_tensor,
                    max_current_length=max_s,
                )
        else:
            if bs > max_bs:
                raise RuntimeError(
                    "Cuda graphs should be generated in decreasing order size to reduce VRAM usage"
                )
            input_ids = self.cuda_graphs[max_bs]["input_ids"][:bs]
            position_ids = self.cuda_graphs[max_bs]["position_ids"][:bs]
            if ATTENTION == "flashinfer":
                block_tables = self.cuda_graphs[max_bs]["block_tables"][: bs * max_bt]
            else:
                block_tables = self.cuda_graphs[max_bs]["block_tables"][:bs]
            slots = self.cuda_graphs[max_bs]["slots"][:bs]
            input_lengths_tensor = self.cuda_graphs[max_bs]["input_lengths"][:bs]
            cache_lengths_tensor = self.cuda_graphs[max_bs]["cache_lengths"][:bs]

        if ATTENTION == "flashinfer":
            from text_generation_server.layers.attention.flashinfer import (
                create_decode_state_cuda_graphs,
            )

            block_tables_ptr = torch.zeros(
                bs + 1, dtype=torch.int32, device=self.device
            )
            last_page_len = torch.ones(bs, dtype=torch.int32, device=self.device)
            state = create_decode_state_cuda_graphs(
                device=input_ids.device,
                block_tables=block_tables,
                block_tables_ptr=block_tables_ptr,
                last_page_len=last_page_len,
                num_heads=self.num_heads,
                num_kv_heads=self.num_kv_heads,
            )
        else:
            state = None

        graph = torch.cuda.CUDAGraph()
        self.cuda_graphs[bs] = {
            "input_ids": input_ids,
            "position_ids": position_ids,
            "kv_cache": self.kv_cache,
            "block_tables": block_tables,
            "slots": slots,
            "input_lengths": input_lengths_tensor,
            "cache_lengths": cache_lengths_tensor,
            "state": state,
            "graph": graph,
        }

        torch.cuda.synchronize()
        # Run once outside to warmup
        with self._forward_context(
            block_tables=block_tables,
            cu_seqlen_prefill=None,
            input_lengths_tensor=input_lengths_tensor,
            state=state,
            cache_lengths_tensor=cache_lengths_tensor,
        ):
            seqlen = Seqlen(
                input_lengths=input_lengths_tensor,
                cache_lengths=cache_lengths_tensor,
                cu_seqlen_q=None,
                max_q=1,
                max_k=max_s,
            )
            self.model.forward(
                input_ids=input_ids,
                position_ids=position_ids,
                cu_seqlen_prefill=None,
                kv_cache=self.kv_cache,
                block_tables=block_tables,
                slots=slots,
                seqlen=seqlen,
                max_s=max_s,
                prefill_cache_indices=None,
                lm_head_indices=None,
            )
            del seqlen

            torch.cuda.synchronize()

            with torch.cuda.graph(graph, pool=MEM_POOL):
                seqlen = Seqlen(
                    input_lengths=input_lengths_tensor,
                    cache_lengths=cache_lengths_tensor,
                    cu_seqlen_q=None,
                    max_q=1,
                    max_k=max_s,
                )
                logits, speculative_logits = self.model.forward(
                    input_ids=input_ids,
                    position_ids=position_ids,
                    cu_seqlen_prefill=None,
                    kv_cache=self.kv_cache,
                    block_tables=block_tables,
                    slots=slots,
                    seqlen=seqlen,
                    max_s=max_s,
                    prefill_cache_indices=None,
                    lm_head_indices=None,
                )
                self.cuda_graphs[bs]["logits"] = logits
                self.cuda_graphs[bs]["speculative_logits"] = speculative_logits
        torch.cuda.synchronize()

    def warmup(
        self,
        batch: FlashCausalLMBatch,
        max_input_tokens: Optional[int],
        max_total_tokens: Optional[int],
    ):
        # The warmup batch is the biggest batch we could ever receive
        self.kv_cache = []
        empty_cache()

        # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
        # Calculate the number of blocks that can be allocated with the free memory
        dtype_size = torch.tensor([], dtype=self.kv_cache_dtype).element_size()
        cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size
        total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size

        try:
            self.init_kv_cache(
                batch.num_blocks,
                self.num_layers,
                self.num_kv_heads,
                self.head_size,
                self.kv_cache_dtype,
                self.device,
            )

            batch_num_blocks = batch.num_blocks

            num_tokens = batch.to_pb().current_tokens
            if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
                torch.cuda.tunable.tuning_enable(False)
            synchronize(self.device)
            free_memory = get_free_memory(
                self.device, MEMORY_FRACTION * TGI_WIGGLE_ROOM
            )
            real_free_memory = get_free_memory(self.device, MEMORY_FRACTION)
            log_master(
                logger.debug,
                f"Free memory {free_memory / 1e9:.2f}GB , (real: {real_free_memory / 1e9:.2f}GB",
            )

            _, _batch, _ = self.generate_token(batch)
        except torch.cuda.OutOfMemoryError as e:
            raise RuntimeError(
                f"Not enough memory to handle {num_tokens} prefill tokens. "
                f"You need to decrease `--max-batch-prefill-tokens`"
            ) from e

        synchronize(self.device)
        free_memory = get_free_memory(self.device, MEMORY_FRACTION * TGI_WIGGLE_ROOM)
        kv_memory = free_memory
        num_blocks = (
            # Leave 5% for some wiggle room
            int(kv_memory // total_cache_size)
            # Add batch.num_blocks as we allocated it above, so it is included in the peak memory.
            + batch_num_blocks
        )

        log_master(logger.info, f"KV-cache blocks: {num_blocks}, size: {BLOCK_SIZE}")
        if max_total_tokens is None:
            if get_support_chunking():
                model_max_length = self.tokenizer.model_max_length
                max_position_embeddings = getattr(
                    self.config, "max_position_embeddings", model_max_length
                )
                max_total_tokens = min(
                    num_blocks * BLOCK_SIZE, model_max_length, max_position_embeddings
                )
            else:
                max_total_tokens = sum(batch.cache_lengths)

        if max_input_tokens is None:
            max_input_tokens = max_total_tokens - 1

        del _batch, batch
        self.kv_cache = []
        empty_cache()

        self.init_kv_cache(
            num_blocks,
            self.num_layers,
            self.num_kv_heads,
            self.head_size,
            self.kv_cache_dtype,
            self.device,
        )

        if SYSTEM == "rocm":
            if (
                os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None
                or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1"
            ):
                torch.cuda.tunable.enable()

                if os.environ.get("PYTORCH_TUNABLEOP_TUNING") != "0":
                    torch.cuda.tunable.tuning_enable(True)

                if os.environ.get("PYTORCH_TUNABLEOP_SEQLENS") is not None:
                    tuning_sequences = [
                        int(val)
                        for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",")
                    ]
                elif CUDA_GRAPHS is not None:
                    tuning_sequences = CUDA_GRAPHS
                else:
                    tuning_sequences = [1, 2, 3, 4, 5, 6, 7]

                tunableop_filepath = os.path.join(
                    HUGGINGFACE_HUB_CACHE,
                    f"tunableop_{self.model_id.replace('/', '-')}_tp{self.world_size}_rank{self.rank}.csv",
                )

                log_master(
                    logger.info,
                    f"PyTorch TunableOp is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`.",
                )

                torch.cuda.tunable.set_filename(
                    tunableop_filepath, insert_device_ordinal=False
                )

                if os.path.isfile(tunableop_filepath):
                    log_master(
                        logger.info,
                        f"The file {tunableop_filepath} already exists and will be reused.",
                    )
                    torch.cuda.tunable.read_file(tunableop_filepath)

                os.makedirs(HUGGINGFACE_HUB_CACHE, exist_ok=True)

                for seqlen in tuning_sequences:
                    log_master(logger.info, f"Warming up TunableOp for seqlen={seqlen}")
                    self.tunableop_warmup(seqlen, max_total_tokens)
                    torch.cuda.tunable.write_file(tunableop_filepath)
                if os.environ.get("PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP") != "1":
                    torch.cuda.tunable.tuning_enable(False)
            else:
                log_master(
                    logger.info,
                    "PyTorch ROCm TunableOp (https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable) is disabled. TunableOp brings an additional 5-8% latency improvement for small sequence lengths but requires a warmup. If necessary, please use the environment variable PYTORCH_TUNABLEOP_ENABLED=1 to enable TunableOp.",
                )

        if CUDA_GRAPHS:
            try:
                log_master(
                    logger.info, f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}"
                )
                # Warmup cuda graphs
                for bs in CUDA_GRAPHS:
                    synchronize(self.device)
                    free_memory = get_free_memory(
                        self.device, MEMORY_FRACTION * TGI_WIGGLE_ROOM
                    )
                    log_master(
                        logger.debug,
                        f"Free RAM before cuda graph {bs} {free_memory / 1e9:.2f}GB",
                    )
                    if self.speculate is None or self.speculate + 1 <= bs:
                        self.cuda_graph_warmup(bs, max_total_tokens, max_total_tokens)
                empty_cache()
                synchronize(self.device)
                free_memory = get_free_memory(
                    self.device, MEMORY_FRACTION * TGI_WIGGLE_ROOM
                )
                log_master(
                    logger.debug,
                    f"Free RAM after cuda graphs {free_memory / 1e9:.2f}GB",
                )
            except torch.cuda.OutOfMemoryError:
                logger.exception("Decode cuda graph warmup failed")
        else:
            log_master(
                logger.info, f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS})."
            )

        assert max_input_tokens is not None
        assert max_total_tokens is not None
        return int(num_blocks * BLOCK_SIZE), max_input_tokens, max_total_tokens

    def tunableop_warmup(self, seqlen: int, max_bt: int):
        input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device)
        position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device)
        slots = torch.arange(seqlen, dtype=torch.int64, device=self.device)

        # Dummy value, some models (starcoder2) don't accept `None`.
        input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device)
        cache_lengths_tensor = torch.zeros(
            seqlen, dtype=torch.int32, device=self.device
        )
        cu_seqlen_prefill = torch.tensor(
            [0, seqlen], device=self.device, dtype=torch.int32
        )
        max_s = seqlen

        block_tables = torch.arange(
            max_bt, dtype=torch.int32, device=self.device
        ).repeat(seqlen)
        block_tables = block_tables.reshape((seqlen, max_bt))

        seqlen = Seqlen(
            input_lengths=input_lengths,
            cache_lengths=cache_lengths_tensor,
            max_k=seqlen,
        )

        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
        self.model.forward(
            input_ids=input_ids,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=self.kv_cache,
            block_tables=block_tables,
            seqlen=seqlen,
            slots=slots,
            max_s=max_s,
            lm_head_indices=None,
            prefill_cache_indices=None,
        )

    def forward(
        self, batch: FlashCausalLMBatch, adapter_data: AdapterBatchData
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Model Forward
        if batch.speculative_ids is not None:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

            speculative_ids = batch.speculative_ids

            B, speculative_length = speculative_ids.shape
            new_length = speculative_length + 1
            new_input_ids = torch.cat(
                [input_ids.unsqueeze(-1), speculative_ids], dim=1
            ).reshape(-1)
            arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0)
            arange_int = arange.to(dtype=torch.int32)
            new_position_ids = (
                position_ids.unsqueeze(-1).expand(B, new_length) + arange
            ).view(-1)

            # Slots can be discontiguous when prefix caching is enabled, so we need to expand the slot_indices,
            # then update the slots with the additional indices to ensure we're grabbing the ones that have been
            # allocated
            slot_indices = (
                batch.slot_indices.unsqueeze(-1).expand(B, new_length) + arange_int
            ).view(-1)
            slots = batch.slots[slot_indices]

            input_lengths = (
                input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
            ).view(-1)
            cache_lengths_tensor = (
                batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length)
            ).reshape(-1)

            # Add Copy the block tables for all members
            block_tables = (
                block_tables.unsqueeze(1)
                .expand(B, new_length, -1)
                .reshape(B * new_length, -1)
                .contiguous()
            )
            max_s = max_s + speculative_length

            input_ids = new_input_ids
            position_ids = new_position_ids
        else:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            cache_lengths_tensor = batch.cache_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

        bs = input_ids.shape[0]
        sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
        if sorted_padded_bs:
            # Get associated cuda graph
            cuda_graph = self.cuda_graphs[sorted_padded_bs[0]]
        else:
            cuda_graph = None

        if cu_seqlen_prefill is not None or cuda_graph is None:
            if ATTENTION == "flashinfer":
                block_tables = block_tables_to_ragged(
                    block_tables=block_tables,
                    input_lengths=batch.input_lengths,
                    cache_lengths=batch.cache_lengths,
                    input_lengths_tensor=batch.input_lengths_tensor,
                    cache_lengths_tensor=batch.cache_lengths_tensor,
                    max_current_length=batch.max_current_length,
                )
            with self._forward_context(
                block_tables=block_tables,
                cu_seqlen_prefill=cu_seqlen_prefill,
                input_lengths_tensor=input_lengths,
                cache_lengths_tensor=cache_lengths_tensor,
            ):
                seqlen = Seqlen(
                    input_lengths=input_lengths,
                    cache_lengths=cache_lengths_tensor,
                    cu_seqlen_q=cu_seqlen_prefill,
                    max_q=batch.max_input_length,
                    max_k=batch.max_current_length,
                )
                logits, speculative_logits = self.model.forward(
                    input_ids=input_ids,
                    position_ids=position_ids,
                    cu_seqlen_prefill=cu_seqlen_prefill,
                    kv_cache=kv_cache,
                    block_tables=block_tables,
                    slots=slots,
                    seqlen=seqlen,
                    max_s=max_s,
                    prefill_cache_indices=batch.prefill_cache_indices,
                    lm_head_indices=lm_head_indices,
                    adapter_data=adapter_data,
                )
                if batch.prefill_cache_indices is not None:
                    batch.prefill_cache_indices = None
                return logits, speculative_logits

        # Copy inputs to the static inputs of the cuda graph
        # Static inputs are potentially padded
        cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids
        cuda_graph["position_ids"][: position_ids.shape[-1]] = position_ids
        if ATTENTION == "flashinfer":
            block_tables = block_tables_to_ragged(
                block_tables=block_tables,
                input_lengths=batch.input_lengths,
                cache_lengths=batch.cache_lengths,
                input_lengths_tensor=batch.input_lengths_tensor,
                cache_lengths_tensor=batch.cache_lengths_tensor,
                max_current_length=batch.max_current_length,
            )
            # assert block_tables.shape[0] >= slots.shape[0]
            cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
        else:
            cuda_graph["block_tables"][
                : block_tables.shape[0], : block_tables.shape[1]
            ] = block_tables

        # XXX: This is working only because block 0 is reserved for the healthcheck
        # so it doesn't matter if we override it with bogus values.
        cuda_graph["slots"].fill_(0)
        cuda_graph["slots"][: slots.shape[0]] = slots
        cuda_graph["input_lengths"].zero_()
        cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
        cuda_graph["cache_lengths"].zero_()
        cuda_graph["cache_lengths"][
            : cache_lengths_tensor.shape[0]
        ] = cache_lengths_tensor

        with self._forward_context(
            block_tables=cuda_graph["block_tables"],
            cu_seqlen_prefill=None,
            input_lengths_tensor=cuda_graph["input_lengths"],
            cache_lengths_tensor=cuda_graph["cache_lengths"],
            state=cuda_graph["state"],
        ):
            # Replay the graph
            cuda_graph["graph"].replay()

        # Slice output to the correct shape
        speculative_logits = (
            cuda_graph["speculative_logits"][:bs]
            if cuda_graph["speculative_logits"] is not None
            else None
        )
        logits = cuda_graph["logits"][:bs]
        return logits, speculative_logits

    @tracer.start_as_current_span("generate_token")
    def generate_token(
        self, batch: FlashCausalLMBatch
    ) -> Tuple[List[Generation], Optional[FlashCausalLMBatch], Tuple[int, int]]:
        start = time.time_ns()
        prefill = batch.prefilling
        if prefill:
            batch.prepare_for_prefill()

        if hasattr(self, "set_inputs_embeds") and callable(self.set_inputs_embeds):
            self.set_inputs_embeds(batch)

        prefill_logprobs = batch.prefill_next_token_indices is not None

        # Update adapter indices for speculative tokens (if present)
        adapter_meta = batch.adapter_meta
        if batch.speculative_ids is not None:
            B, speculative_length = batch.speculative_ids.shape
            new_length = speculative_length + 1
            adapter_indices = (
                adapter_meta.adapter_indices.unsqueeze(-1)
                .expand(B, new_length)
                .reshape(-1)
            )
            adapter_segments = adapter_meta.adapter_segments * new_length
            adapter_meta = AdapterBatchMetadata(
                adapter_indices=adapter_indices,
                adapter_set=adapter_meta.adapter_set,
                adapter_segments=adapter_segments,
                segment_indices=adapter_meta.segment_indices,
            )

        # Assign pointers to adapter weights
        # TODO(travis): don't update this if indices haven't changed
        adapter_data = AdapterBatchData.from_meta(
            adapter_meta,
            self.layer_to_adapter_weights,
            prefill,
            batch.prefill_head_indices,
        )

        out, speculative_logits = self.forward(batch, adapter_data)

        if prefill:
            next_token_logits = (
                out[batch.prefill_next_token_indices] if prefill_logprobs else out
            )
            if speculative_logits is not None:
                speculative_logits = (
                    speculative_logits[batch.prefill_next_token_indices]
                    if prefill_logprobs
                    else speculative_logits
                )
            if len(batch) > 1 and prefill_logprobs:
                # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs
                # When batch == 1, we will just use the batch.input_ids values directly
                prefill_tokens_indices = batch.input_ids.new_zeros(len(out))
        else:
            prefill_logprobs = None
            next_token_logits = out

        finished_prefilling = True
        next_chunk_lengths = []
        current_prefilling_mask = batch.prefilling_mask
        if prefill:
            if get_support_chunking():
                next_prefilling_mask = []
                # Budget in tokens for the next batch
                # We remove (len(batch) - 1) to always have enough space for at least a single decode
                # for the remaining requests -1 because the first request does not need to be removed from the budget
                # (ex: you have one request in the batch, you want it to take the full budget not budget -1)
                batch_budget = get_max_prefill_tokens() - (len(batch) - 1)
                # We reverse to prioritize older requests
                # zip() is not reversible so reverse the underlying lists instead
                for cache_length, input_length, prompt_length in zip(
                    reversed(batch.cache_lengths),
                    reversed(batch.input_lengths),
                    reversed(batch.prompt_lengths),
                ):
                    remaining_prefill_tokens = max(
                        prompt_length - cache_length - input_length, 0
                    )
                    if remaining_prefill_tokens > 0:
                        next_chunk_length = max(
                            min(remaining_prefill_tokens, batch_budget), 1
                        )
                        batch_budget -= next_chunk_length
                        finished_prefilling = False
                        next_prefilling_mask.append(True)
                    else:
                        # FIXME: use true number of accepted tokens instead of 1
                        # Since speculation will be turned off, this is always true
                        next_chunk_length = 1
                        next_prefilling_mask.append(False)
                    next_chunk_lengths.append(next_chunk_length)

                # Reverse back the obtained values²
                next_chunk_lengths.reverse()
                next_prefilling_mask.reverse()
            else:
                # The model does not support chunking
                # We know we only do a single prefill
                finished_prefilling = True
                next_prefilling_mask = [False] * len(batch)

            batch.prefilling = not finished_prefilling
            batch.prefilling_mask = next_prefilling_mask

        speculate = get_speculate()
        (
            next_input_ids,
            next_token_logprobs,
            logprobs,
            accepted_ids,
            speculative_ids,
        ) = batch.next_token_chooser(
            batch.all_input_ids_tensor[:, : batch.max_current_length],
            next_token_logits,
            speculate,
            batch.speculative_ids,
            speculative_logits,
        )

        batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
            batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs, accepted_ids
        )

        # Since we are done prefilling, all the tensors that were concatenating values for all the requests
        # instantly become of shape [BATCH_SIZE]
        if prefill and finished_prefilling:
            indices = batch.cu_seqlen_prefill[1:] - 1
            batch.position_ids = batch.position_ids[indices]
            batch.slot_indices = batch.slot_indices[indices]
            batch.adapter_meta.adapter_indices = batch.adapter_meta.adapter_indices[
                indices
            ]

        # Zipped iterator
        iterator = zip(
            batch.requests,
            batch.prompt_lengths,
            batch.cache_lengths,
            batch.input_lengths,
            batch.all_input_ids,
            accepted_ids,
            current_prefilling_mask,
            batch.prefilling_mask,
        )

        # We do two for loops as the first one can run completely asynchronously from the GPU while for the second
        # one, we need to first do a GPU <-> CPU sync
        # It is faster if we delay this sync for the maximum amount of time

        # For each member of the batch
        # Cumulative length
        cu_accepted_ids = accepted_ids.new_zeros(accepted_ids.shape[0] + 1)
        torch.cumsum(accepted_ids, dim=0, out=cu_accepted_ids[1:])
        cumulative_length = 0
        for i, (
            request,
            prompt_length,
            cache_length,
            input_length,
            all_input_ids,
            n_accepted_ids,
            request_was_prefilling,
            request_is_prefilling,
        ) in enumerate(iterator):
            # Used to gather prefill logprobs
            # Copy batch.all_input_ids_tensor to prefill_token_indices
            if request.prefill_logprobs and request_was_prefilling:
                # Indexing metadata
                out_start_index = batch.prefill_cu_outlens[i]
                out_end_index = batch.prefill_cu_outlens[i + 1]

                # Logprobs generated by the model are for the next token
                # So we need to translate the id tensor by 1
                ids = batch.all_input_ids_tensor[
                    i, cache_length + 1 : cache_length + input_length + 1
                ]
                if len(batch) > 1:
                    prefill_tokens_indices[out_start_index:out_end_index] = ids
                else:
                    # Set prefill_tokens_indices to the correct slice
                    prefill_tokens_indices = ids

            # If the device does not support triton, we copy one by one
            if not request_is_prefilling and not has_triton():
                # Only save tokens if we are done prefilling for this request
                batch.all_input_ids_tensor[
                    i,
                    batch.cache_lengths_tensor[i]
                    + batch.input_lengths[i] : batch.cache_lengths_tensor[i]
                    + batch.input_lengths[i]
                    + accepted_ids[i],
                ] = next_input_ids[cu_accepted_ids[i] : cu_accepted_ids[i + 1]]
            cumulative_length += input_length

        # If the device support triton, we can use a fused kernel
        if has_triton():
            copy_next_input_ids_inplace(
                speculate + 1,
                batch.all_input_ids_tensor,
                batch.cache_lengths_tensor,
                batch.input_lengths_tensor,
                batch.prompt_lengths_tensor,
                next_input_ids,
                cu_accepted_ids,
            )

        # Update values
        # These values can be updated without a GPU -> CPU sync
        if not prefill or (prefill and finished_prefilling):
            batch.input_ids = next_input_ids[cu_accepted_ids[1:] - 1]
            batch.speculative_ids = speculative_ids
            if batch.position_ids.dim() == 2:
                # Qwen2_vl case:
                batch.position_ids += accepted_ids.unsqueeze(-1)
            else:
                batch.position_ids += accepted_ids
            batch.cache_lengths_tensor += batch.input_lengths_tensor + accepted_ids - 1
            batch.input_lengths_tensor = torch.ones_like(batch.input_lengths_tensor)
            batch.slot_indices += accepted_ids

        if prefill and prefill_logprobs:
            # Get prefill logprobs with inplace softmax (avoid copying the `out` tensor (max_batch_prefill_tokens * vocab_size))
            torch.log_softmax(out, -1, out=out)
            prefill_logprobs_tensor = out
            prefill_logprobs = torch.gather(
                prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1)
            )
            # GPU <-> CPU sync
            prefill_logprobs = prefill_logprobs.view(-1).tolist()

        # Does a GPU <-> CPU sync internally
        if prefill and finished_prefilling:
            # adjust segment lengths to account for all request lengths being 1 during decoding
            adapter_segments, _ = find_segments(batch.adapter_meta.adapter_indices)
            batch.adapter_meta.adapter_segments = torch.tensor(
                adapter_segments,
                dtype=torch.int32,
                device=batch.adapter_meta.adapter_segments.device,
            )

        # GPU <-> CPU sync
        next_token_logprobs = next_token_logprobs.tolist()
        next_token_ids = next_input_ids.tolist()
        accepted_ids = accepted_ids.tolist()

        # Update values if we need to continue prefilling
        # This represents the `else` case of the `Update values` if above
        # but since this require the `next_token_ids` to be on CPU, it is better to do it here
        if prefill and not finished_prefilling:
            # Speculation must be ignored while we prefill even with chunking
            # it simplifies everything
            assert batch.speculative_ids is None

            all_postfix_ids = []
            for i, (
                request_prefilling,
                next_token_id,
                all_input_ids,
                cache_length,
                input_length,
                next_chunk_length,
            ) in enumerate(
                zip(
                    batch.prefilling_mask,
                    next_token_ids,
                    batch.all_input_ids,
                    batch.cache_lengths,
                    batch.input_lengths,
                    next_chunk_lengths,
                )
            ):
                if request_prefilling:
                    next_cache_length = cache_length + input_length
                    # Get new prompt IDs to prefill
                    postfix_ids = all_input_ids[
                        next_cache_length : next_cache_length + next_chunk_length
                    ]
                else:
                    # This request is done prefilling, the new id is the one selected the sampling method
                    postfix_ids = [next_token_id]

                all_postfix_ids.append(postfix_ids)

            batch.input_ids = all_postfix_ids

        start_decode = time.time_ns()

        # Results
        generations: List[Generation] = []
        stopped = True

        # Zipped iterator
        iterator = zip(
            batch.requests,
            batch.prompt_lengths,
            batch.cache_lengths,
            batch.input_lengths,
            batch.prefix_offsets,
            batch.read_offsets,
            batch.stopping_criterias,
            batch.all_input_ids,
            batch.next_token_chooser.do_sample,
            batch.next_token_chooser.seeds,
            batch.top_n_tokens,
            current_prefilling_mask,
            batch.prefilling_mask,
            accepted_ids,
            batch_top_token_ids,
            batch_top_token_logprobs,
        )

        # Reset max_input_length
        batch.max_input_length = 0
        # For each member of the batch
        index = 0
        for i, (
            request,
            prompt_length,
            cache_length,
            input_length,
            prefix_offset,
            read_offset,
            stopping_criteria,
            all_input_ids,
            do_sample,
            seed,
            top_n_tokens,
            request_was_prefilling,
            request_is_prefilling,
            n_accepted_ids,
            top_token_ids,
            top_token_logprobs,
        ) in enumerate(iterator):
            # Compute logprobs first as, even though we might skip the token,
            # it can still be required to compute the logprobs
            # modulo on request.id as it is robust to batch.filter whereas the index in the batch is not and we need
            # this state to be stable
            if request.id % self.world_size == self.rank:
                # Prefill
                if request_was_prefilling and request.prefill_logprobs:
                    out_start_index = batch.prefill_cu_outlens[i]
                    out_end_index = batch.prefill_cu_outlens[i + 1]
                    if not request_is_prefilling:
                        # The request is dones prefilling, meaning that we started generating new tokens
                        # The last logprob is a logprob for a generated token that was not part of the prompt
                        # We need to remove it
                        out_end_index -= 1

                    request_prefill_logprobs = prefill_logprobs[
                        out_start_index:out_end_index
                    ]
                    # Logprobs generated by the model are for the next token
                    # So we need to translate the id tensor by 1
                    prefill_token_ids = all_input_ids[
                        cache_length + 1 : cache_length + input_length + 1
                    ]

                    past_prefill_logprob_tokens = batch.prefill_logprob_tokens[i]

                    if past_prefill_logprob_tokens is None:
                        # add nan for cached prompt tokens/first token
                        request_prefill_logprobs = [float("nan")] * (
                            cache_length + 1
                        ) + request_prefill_logprobs
                        prefill_token_ids = (
                            all_input_ids[: cache_length + 1] + prefill_token_ids
                        )

                    prefill_texts = self.tokenizer.batch_decode(
                        prefill_token_ids,
                        clean_up_tokenization_spaces=False,
                        skip_special_tokens=False,
                    )

                    prefill_logprob_tokens = Tokens(
                        prefill_token_ids,
                        request_prefill_logprobs,
                        prefill_texts,
                        is_special=[],
                    )
                    if past_prefill_logprob_tokens is not None:
                        prefill_logprob_tokens = (
                            past_prefill_logprob_tokens + prefill_logprob_tokens
                        )

                    batch.prefill_logprob_tokens[i] = prefill_logprob_tokens
                else:
                    batch.prefill_logprob_tokens[i] = None

            # If it is, the tokens we decoded should be ignored
            if request_is_prefilling:
                # Make sure that we do not stop as even though this request did not create a token, it is still
                # processing
                stopped = False
                new_input_length = next_chunk_lengths[i]
                new_cache_length = cache_length + input_length
            else:
                new_input_length = 1
                new_cache_length = cache_length + input_length + n_accepted_ids - 1
                # Append next token to all tokens
                next_token_texts = []
                left = 0

                if n_accepted_ids > 1:
                    log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}")

                current_stopped = False
                for j in range(index, index + n_accepted_ids):
                    # Generated token
                    next_token_id = next_token_ids[j]
                    all_input_ids.append(next_token_id)
                    next_token_text, prefix_offset, read_offset = self.decode_token(
                        all_input_ids,
                        prefix_offset,
                        read_offset,
                    )
                    next_token_texts.append(next_token_text)

                    stop, reason = stopping_criteria(
                        next_token_id,
                        next_token_text,
                    )

                    if stop:
                        left = index + n_accepted_ids - j - 1
                        current_stopped = True
                        break
                    else:
                        current_stopped = False
                stopped = stopped and current_stopped

                _next_token_ids = next_token_ids[index : index + n_accepted_ids - left]
                _next_token_logprobs = next_token_logprobs[
                    index : index + n_accepted_ids - left
                ]

                # Shard generations
                # All generations will be appended in the rust sharded client
                if request.id % self.world_size == self.rank:
                    if stop:
                        # Decode generated tokens
                        output_text, _, _ = self.decode_token(
                            all_input_ids,
                            prefix_offset=len(all_input_ids)
                            - stopping_criteria.current_tokens
                            - 1,
                            read_offset=len(all_input_ids)
                            - stopping_criteria.current_tokens,
                            skip_special_tokens=True,
                        )
                        generated_text = GeneratedText(
                            output_text,
                            stopping_criteria.current_tokens,
                            reason,
                            seed if do_sample else None,
                        )
                    else:
                        generated_text = None

                    if top_n_tokens > 0:
                        all_top_tokens = []
                        for top_token_ids, top_token_logprobs in zip(
                            top_token_ids, top_token_logprobs
                        ):
                            toptoken_texts = self.tokenizer.batch_decode(
                                top_token_ids,
                                clean_up_tokenization_spaces=False,
                                skip_special_tokens=False,
                            )
                            special_toptokens = [
                                token_id in self.all_special_ids
                                for token_id in top_token_ids
                            ]
                            top_tokens = Tokens(
                                top_token_ids,
                                top_token_logprobs,
                                toptoken_texts,
                                special_toptokens,
                            )
                            all_top_tokens.append(top_tokens)
                        top_tokens = all_top_tokens
                    else:
                        top_tokens = None

                    generation = Generation(
                        request.id,
                        batch.prefill_logprob_tokens[i],
                        Tokens(
                            _next_token_ids,
                            _next_token_logprobs,
                            next_token_texts,
                            [nid in self.all_special_ids for nid in _next_token_ids],
                        ),
                        generated_text,
                        top_tokens,
                    )

                    generations.append(generation)

                # accept each new token for this specific request since we may
                # have more than one new token per request with speculative decoding
                for next_token_id in _next_token_ids:
                    batch.next_token_chooser = (
                        batch.next_token_chooser.advance_grammar_single(
                            i, next_token_id
                        )
                    )

            # Update values
            index += n_accepted_ids
            batch.cache_lengths[i] = new_cache_length
            batch.max_input_length = max(batch.max_input_length, new_input_length)
            batch.input_lengths[i] = new_input_length
            current_length = new_cache_length + new_input_length
            batch.max_current_length = max(batch.max_current_length, current_length)

            batch.prefix_offsets[i] = prefix_offset
            batch.read_offsets[i] = read_offset
            batch.all_input_ids[i] = all_input_ids

        if stopped:
            # No need to return a batch if we know that all requests stopped
            forward_ns = start_decode - start
            decode_ns = time.time_ns() - start_decode
            return generations, None, (forward_ns, decode_ns)

        if prefill and finished_prefilling:
            # We do not need prefill tensors anymore
            batch.cu_seqlen_prefill = None
            batch.prefill_cache_indices = None
            batch.prefill_cu_outlens = None
            batch.prefill_head_indices = None
            batch.prefill_next_token_indices = None

        forward_ns = start_decode - start
        decode_ns = time.time_ns() - start_decode
        return generations, batch, (forward_ns, decode_ns)

    def _forward_context(
        self,
        *,
        block_tables: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        input_lengths_tensor: torch.Tensor,
        cache_lengths_tensor: torch.Tensor,
        state: Optional[Any] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> ContextManager:
        if ATTENTION != "flashinfer":
            return nullcontext()

        from text_generation_server.layers.attention.flashinfer import (
            use_decode_state,
            use_prefill_with_paged_kv_state,
        )

        if cu_seqlen_prefill is not None:
            return use_prefill_with_paged_kv_state(
                state=(
                    state if state is not None else self.prefill_with_paged_kv_state
                ),
                block_tables=block_tables,
                cu_seqlens=cu_seqlen_prefill,
                custom_mask=attention_mask,
                input_lengths=input_lengths_tensor + cache_lengths_tensor,
                num_heads=self.num_heads,
                num_kv_heads=self.num_kv_heads,
                head_size=self.head_size,
                page_size=BLOCK_SIZE,
                kv_dtype=self.kv_cache_dtype,
                q_dtype=self.dtype,
            )
        else:
            assert input_lengths_tensor is not None
            return use_decode_state(
                state=state if state is not None else self.decode_state,
                input_lengths=input_lengths_tensor + cache_lengths_tensor,
                block_tables=block_tables,
                num_heads=self.num_heads,
                num_kv_heads=self.num_kv_heads,
                head_size=self.head_size,
                page_size=BLOCK_SIZE,
                kv_cache_dtype=self.kv_cache_dtype,
                q_dtype=self.dtype,
            )


================================================
FILE: server/text_generation_server/models/galactica.py
================================================
import re
import torch
import torch.distributed


from transformers import (
    PreTrainedTokenizerBase,
)
from text_generation_server.models.causal_lm import CausalLMBatch
from text_generation_server.pb import generate_pb2
from text_generation_server.utils import (
    NextTokenChooser,
    StoppingCriteria,
)
from text_generation_server.utils.chunks import concat_text_chunks

# CREDIT: Papers with code => https://github.com/paperswithcode/galai/blob/main/galai/utils.py

# we split individual characters inside special tokens like [START_DNA]
CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])")

# token added to implement a custom sequence tokenization. This token is added at
# corpus cleaning step and removed in pretokenization. The digits are added to increase the chance
# that they do not occur in the corpus. The digits are escaped so that the token does not appear
# literally in the source code in case we ever include it in the training data.
SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"


def _insert_split_marker(m: re.Match):
    """
    Applies split marker based on a regex match of special tokens such as
    [START_DNA].
    Parameters
    ----------
    n : str
        Input text to split
    Returns
    ----------
    str - the text with the split token added
    """
    start_token, _, sequence, end_token = m.groups()
    sequence = re.sub(r"(.)", rf"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL)
    return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}"


def escape_custom_split_sequence(text):
    """
    Applies custom splitting to the text for GALILEO's tokenization
    Parameters
    ----------
    text : str
        Input text to split
    Returns
    ----------
    str - the text with the split token added
    """
    return CUSTOM_SEQ_RE.sub(_insert_split_marker, text)


# END CREDIT


class GalacticaCausalLMBatch(CausalLMBatch):
    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "GalacticaCausalLMBatch":
        inputs = []
        next_token_choosers = []
        stopping_criterias = []
        prefix_offsets = []
        top_n_tokens = []
        read_offsets = []
        requests_idx_mapping = {}

        # Parse batch
        max_truncation = 0
        padding_right_offset = 0
        max_decode_tokens = 0
        for i, r in enumerate(pb.requests):
            requests_idx_mapping[r.id] = i
            # Add escape_custom_split_sequence to the CausalLMBatch logic
            inputs.append(
                escape_custom_split_sequence(concat_text_chunks(r.input_chunks.chunks))
            )
            next_token_choosers.append(
                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
            )
            stopping_criteria = StoppingCriteria.from_pb(
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(r.top_n_tokens)
            max_truncation = max(max_truncation, r.truncate)
            max_decode_tokens += stopping_criteria.max_new_tokens
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
            )

        tokenized_inputs = tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=max_truncation,
        ).to(device)
        for _ in pb.requests:
            input_len = tokenized_inputs["input_ids"].shape[1]
            prefix_offsets.append(0)
            read_offsets.append(input_len)

        input_lengths = tokenized_inputs["attention_mask"].sum(1)
        max_input_length = input_lengths.max()

        input_ids = tokenized_inputs["input_ids"]
        # Allocate maximum attention_mask
        attention_mask = input_ids.new_zeros(
            (pb.size, max_input_length + padding_right_offset)
        )
        # Copy tokenizer attention_mask into fully allocated attention_mask
        attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"]

        position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1
        position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)
        all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1)
        top_n_tokens_tensor = torch.tensor(
            top_n_tokens, device=device, dtype=torch.int64
        )

        max_tokens = len(inputs) * max_input_length + max_decode_tokens

        return cls(
            batch_id=pb.id,
            requests=pb.requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=None,
            all_input_ids=list(all_input_ids),
            input_lengths=input_lengths.tolist(),
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length.item(),
            padding_right_offset=padding_right_offset,
            max_tokens=max_tokens,
        )


================================================
FILE: server/text_generation_server/models/globals.py
================================================
import torch
import os
from loguru import logger
from typing import Dict, Optional

from text_generation_server.utils.log import log_master

REQUEST_LOGPROBS = os.getenv("REQUEST_LOGPROBS", "0").lower() in {"1", "true"}
ATTENTION = os.environ["ATTENTION"]
# default_prefix_caching = "1" if ATTENTION in {"flashinfer", "flashdecoding"} else "0"
PREFIX_CACHING = os.environ["PREFIX_CACHING"].lower() in {
    "1",
    "true",
}
PREFILL_CHUNKING = os.getenv("PREFILL_CHUNKING", "1").lower() in {"1", "true"}
log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
_expected = {"paged", "flashdecoding", "flashdecoding-ipex", "flashinfer"}
assert (
    ATTENTION in _expected
), f"Attention is not valid {ATTENTION}, expected {_expected}"
log_master(logger.info, f"Using Attention = {ATTENTION}")

if PREFIX_CACHING and ATTENTION not in {
    "flashinfer",
    "flashdecoding",
    "flashdecoding-ipex",
}:
    raise RuntimeError("Prefix caching is only supported with flashinfer")

MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
# Test a 70B model on 4xA100 under load for latest failure
TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
assert TGI_WIGGLE_ROOM > 0
assert TGI_WIGGLE_ROOM < 1


# This is overridden by the cli
BLOCK_SIZE: int
if ATTENTION == "flashdecoding":
    BLOCK_SIZE = 256
elif ATTENTION == "flashinfer":
    BLOCK_SIZE = 1
elif ATTENTION == "flashdecoding-ipex":
    BLOCK_SIZE = 64
else:
    BLOCK_SIZE = 16

cuda_graphs = os.getenv("CUDA_GRAPHS")
if cuda_graphs is not None:
    try:
        cuda_graphs = [int(item) for item in cuda_graphs.split(",")]
    except Exception as e:
        raise RuntimeError(
            f"Could not parse cuda graphs {cuda_graphs}, expected comma separated list for batch sizes to run on: {e}"
        )
else:
    cuda_graphs = None
# sorting the cuda graphs in descending order helps reduce the
# memory impact and results in less memory usage
if cuda_graphs is not None:
    cuda_graphs.sort(reverse=True)

CUDA_GRAPHS = cuda_graphs

# NOTE: eventually we should move this into the router and pass back the
# index in all cases.
ADAPTER_TO_INDEX: Optional[Dict[str, int]] = None


def set_adapter_to_index(adapter_to_index: Dict[str, int]):
    global ADAPTER_TO_INDEX
    ADAPTER_TO_INDEX = adapter_to_index


def get_adapter_to_index():
    global ADAPTER_TO_INDEX
    return ADAPTER_TO_INDEX


================================================
FILE: server/text_generation_server/models/idefics_causal_lm.py
================================================
from io import BytesIO
from PIL import Image
import torch
import time

from dataclasses import dataclass
from opentelemetry import trace
from transformers import (
    AutoConfig,
    AutoProcessor,
    AutoTokenizer,
    PreTrainedTokenizerBase,
    ProcessorMixin,
)
from typing import Optional, Tuple, List, Type, Dict

from text_generation_server.models import Model
from text_generation_server.models.types import (
    Batch,
    Tokens,
    Generation,
    GeneratedText,
)
from text_generation_server.pb import generate_pb2
from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling
import torch.distributed
from text_generation_server.models.custom_modeling.idefics_modeling import (
    IdeficsForVisionText2Text,
)
from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
)
from text_generation_server.utils.quantization import get_loader

from text_generation_server.utils.import_utils import SYSTEM


tracer = trace.get_tracer(__name__)


@dataclass
class IdeficsCausalLMBatch(Batch):
    batch_id: int
    requests: List[generate_pb2.Request]
    requests_idx_mapping: Dict[int, int]

    # Decoder values
    input_ids: torch.Tensor
    attention_mask: torch.Tensor
    position_ids: torch.Tensor
    pixel_values: Optional[torch.Tensor]
    image_hidden_states: Optional[torch.Tensor]
    image_attention_mask: Optional[torch.Tensor]
    past_key_values: Optional[List[Tuple]]

    # All tokens
    all_input_ids: List[torch.Tensor]

    # Lengths of all generations present in the batch
    input_lengths: List[int]
    prefix_offsets: List[int]
    read_offsets: List[int]

    # Generation helpers
    next_token_choosers: List[NextTokenChooser]
    stopping_criterias: List[StoppingCriteria]

    # Metadata used for padding
    max_input_length: int
    padding_right_offset: int

    # Maximum number of tokens this batch will grow to
    max_tokens: int

    # Past metadata
    keys_head_dim_last: bool = True

    def to_pb(self) -> generate_pb2.CachedBatch:
        return generate_pb2.CachedBatch(
            id=self.batch_id,
            request_ids=[r.id for r in self.requests],
            size=len(self),
            max_tokens=self.max_tokens,
            current_tokens=len(self),
        )

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "IdeficsCausalLMBatch":
        raise NotImplementedError

    @classmethod
    def from_pb_processor(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        processor: ProcessorMixin,  # Hack
        config,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "IdeficsCausalLMBatch":
        inputs = []
        next_token_choosers = []
        stopping_criterias = []
        prefix_offsets = []
        read_offsets = []
        requests_idx_mapping = {}

        # Parse batch
        max_truncation = 0
        padding_right_offset = 0
        max_decode_tokens = 0
        for i, r in enumerate(pb.requests):
            requests_idx_mapping[r.id] = i
            inputs.append(r.input_chunks.chunks)
            next_token_choosers.append(
                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
            )
            stopping_criteria = StoppingCriteria.from_pb(
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            max_truncation = max(max_truncation, r.truncate)
            max_decode_tokens += stopping_criteria.max_new_tokens
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
            )

        # TODO Check impact on idefics
        prompts = []
        for inp in inputs:
            # Each input is encoded into a list, where each element of this input list is either a string or a URL
            prompt = []
            for chunk in inp:
                chunk_type = chunk.WhichOneof("chunk")
                if chunk_type == "text":
                    prompt.append(chunk.text)
                elif chunk_type == "image":
                    image = Image.open(BytesIO(chunk.image.data))
                    prompt.append(image)
                else:
                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
            prompts.append(prompt)

        # The processor replaces the call to tokenizer, and
        # a/ takes care of fetching images from the URL
        # b/ generate the correct input_ids, attention_mask, pixel_values, image_attention_mask to feed to the model
        tokenized_inputs = processor(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_truncation,
            # TODO Check impact on idefics
            # add_end_of_utterance_token=False,  # Already taken care of inside the prompts, so bypassing the processor's handling of this token
        ).to(device)
        for _ in pb.requests:
            input_len = tokenized_inputs["input_ids"].shape[1]
            prefix_offsets.append(
                input_len - 5
            )  # To decode without potential fallbacks errors
            read_offsets.append(
                input_len
            )  # To decode without potential fallbacks errors

        input_lengths = tokenized_inputs["attention_mask"].sum(1)
        max_input_length = input_lengths.max()

        input_ids = tokenized_inputs["input_ids"]
        pixel_values = tokenized_inputs.get("pixel_values", None)
        image_hidden_states = None
        # Allocate maximum attention_mask
        attention_mask = input_ids.new_zeros(
            (pb.size, max_input_length + padding_right_offset)
        )
        # Copy tokenizer attention_mask into fully allocated attention_mask
        attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"]
        # Do the same for image_attention_mask
        if pixel_values is None:
            image_attention_mask = None
        else:
            image_attention_mask = input_ids.new_zeros(
                (
                    pb.size,
                    max_input_length + padding_right_offset,
                    pixel_values.size(1),
                )
            )
            image_attention_mask[:, :max_input_length, :] = tokenized_inputs[
                "image_attention_mask"
            ]

        position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1
        position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)
        all_input_ids = tokenized_inputs["input_ids"].T.split(
            1, dim=1
        )  # It's input_ids but splitted into a tuple of tensors where each tensor is (seq_len, 1) size. It is then transformed into a list

        max_tokens = len(inputs) * (max_input_length + max_decode_tokens)

        return cls(
            batch_id=pb.id,
            requests=pb.requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            pixel_values=pixel_values,
            image_hidden_states=image_hidden_states,
            image_attention_mask=image_attention_mask,
            past_key_values=None,
            all_input_ids=list(all_input_ids),
            input_lengths=input_lengths.tolist(),
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            max_input_length=max_input_length.item(),
            padding_right_offset=padding_right_offset,
            max_tokens=max_tokens,
        )

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]) -> Optional["IdeficsCausalLMBatch"]:
        # It deletes requests from the batch. For instance when client lost connection
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")
        if len(request_ids) == len(self):
            return self

        keep_indices = []

        # New values after filtering
        requests_idx_mapping = {}
        requests = []
        input_lengths = []
        prefix_offsets = []
        read_offsets = []
        all_input_ids = []
        max_input_length = 0

        next_token_choosers = []
        stopping_criterias = []

        total_remaining_decode_tokens = 0
        new_padding_right_offset = 0

        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            requests_idx_mapping[request_id] = i
            keep_indices.append(idx)

            requests.append(self.requests[idx])
            prefix_offsets.append(self.prefix_offsets[idx])
            read_offsets.append(self.read_offsets[idx])
            all_input_ids.append(self.all_input_ids[idx])

            request_input_length = self.input_lengths[idx]
            input_lengths.append(request_input_length)
            max_input_length = max(max_input_length, request_input_length)

            next_token_choosers.append(self.next_token_choosers[idx])
            stopping_criteria = self.stopping_criterias[idx]
            stopping_criterias.append(stopping_criteria)
            remaining_decode_tokens = (
                stopping_criteria.max_new_tokens - stopping_criteria.current_tokens
            )
            total_remaining_decode_tokens += remaining_decode_tokens
            new_padding_right_offset = max(
                new_padding_right_offset, remaining_decode_tokens
            )

        # Apply indices to input_ids, attention mask, past key values and other items that need to be cached
        input_ids = self.input_ids[keep_indices]
        position_ids = self.position_ids[keep_indices]
        self.attention_mask = self.attention_mask[
            keep_indices,
            -(self.padding_right_offset + max_input_length) : (
                self.attention_mask.shape[1] - self.padding_right_offset
            )
            + new_padding_right_offset,
        ]
        # Do the same for pixel_values and image_attention_mask
        pixel_values = self.pixel_values[keep_indices]
        self.image_attention_mask = self.image_attention_mask[
            keep_indices,
            -(self.padding_right_offset + max_input_length) : (
                self.image_attention_mask.shape[1] - self.padding_right_offset
            )
            + new_padding_right_offset,
            :,
        ]
        if self.image_hidden_states is None:
            image_hidden_states = None
        else:
            image_hidden_states = self.image_hidden_states[keep_indices]

        # Ensure that past_key_values tensors can be updated in-place
        if type(self.past_key_values[0]) is tuple:
            self.past_key_values = [list(layer) for layer in self.past_key_values]

        # Update tensors in-place to allow incremental garbage collection
        past_kv_length = max_input_length - 1
        for layer in self.past_key_values:
            past_keys, past_values = layer
            if len(past_keys.shape) == 3:
                # Force past to be of dim [self_size, num_heads, ...] for easy indexing
                past_keys = past_keys.view(len(self), -1, *past_keys.shape[-2:])
                past_values = past_values.view(len(self), -1, *past_values.shape[-2:])
            if self.keys_head_dim_last:
                layer[0] = past_keys[keep_indices, :, -past_kv_length:, :]
            else:
                layer[0] = past_keys[keep_indices, :, :, -past_kv_length:]
            del past_keys
            layer[1] = past_values[keep_indices, :, -past_kv_length:, :]
            del past_values

        max_tokens = len(request_ids) * max_input_length + total_remaining_decode_tokens

        self.requests = requests
        self.requests_idx_mapping = requests_idx_mapping
        self.input_ids = input_ids
        self.pixel_values = pixel_values
        self.image_hidden_states = image_hidden_states
        self.position_ids = position_ids
        self.all_input_ids = all_input_ids
        self.input_lengths = input_lengths
        self.prefix_offsets = prefix_offsets
        self.read_offsets = read_offsets
        self.next_token_choosers = next_token_choosers
        self.stopping_criterias = stopping_criterias
        self.max_input_length = max_input_length
        self.padding_right_offset = new_padding_right_offset
        self.max_tokens = max_tokens

        return self

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(
        cls, batches: List["IdeficsCausalLMBatch"]
    ) -> "IdeficsCausalLMBatch":
        # It adds new requests to the batch
        # Used for padding
        total_batch_size = 0
        max_input_length = 0
        max_num_images = 0
        padding_right_offset = 0
        for batch in batches:
            total_batch_size += len(batch)
            max_input_length = max(max_input_length, batch.max_input_length)
            max_num_images = max(max_num_images, batch.pixel_values.size(1))
            padding_right_offset = max(padding_right_offset, batch.padding_right_offset)

        # Batch attributes
        requests = []
        requests_idx_mapping = {}
        input_lengths = []
        prefix_offsets = []
        read_offsets = []
        all_input_ids = []
        next_token_choosers = []
        stopping_criterias = []
        max_tokens = 0

        # Batch tensors
        input_ids = None
        attention_mask = None
        position_ids = None
        pixel_values = None
        image_hidden_states = None
        image_attention_mask = None
        past_key_values = []

        # Used for slicing correctly inside the tensors
        # Equivalent to a cumsum on batch sizes
        start_index = 0
        for i, batch in enumerate(batches):
            requests.extend(batch.requests)
            input_lengths.extend(batch.input_lengths)
            prefix_offsets.extend(batch.prefix_offsets)
            read_offsets.extend(batch.read_offsets)
            all_input_ids.extend(batch.all_input_ids)
            next_token_choosers.extend(batch.next_token_choosers)
            stopping_criterias.extend(batch.stopping_criterias)

            if i == 0:
                requests_idx_mapping = batch.requests_idx_mapping
            else:
                # We need to offset the mapping for each batch by the cumulative batch size
                for k, v in batch.requests_idx_mapping.items():
                    requests_idx_mapping[k] = v + start_index

            # Slicing end index for this batch
            end_index = start_index + len(batch)

            # We only concatenate batches that did at least one step
            if batch.past_key_values is None:
                raise ValueError("only concatenate prefilled batches")

            # Create empty tensor
            # input_ids is always of shape [batch_size, 1]
            # We do not need to pad it
            if input_ids is None:
                input_ids = batch.input_ids.new_empty((total_batch_size, 1))
            # Copy to correct indices
            input_ids[start_index:end_index] = batch.input_ids

            # Create padded tensor
            if attention_mask is None:
                attention_mask = batch.attention_mask.new_zeros(
                    (total_batch_size, max_input_length + padding_right_offset),
                )

            curr_batch_max_num_images = batch.pixel_values.size(1)
            if pixel_values is None:
                pixel_values = batch.pixel_values.new_zeros(
                    (total_batch_size, max_num_images, 3, 224, 224)
                )
            pixel_values[start_index:end_index, :curr_batch_max_num_images] = (
                batch.pixel_values
            )

            if image_attention_mask is None:
                image_attention_mask = batch.image_attention_mask.new_zeros(
                    (
                        total_batch_size,
                        max_input_length + padding_right_offset,
                        max_num_images,
                    )
                )

            # We need to slice the attention mask to remove padding from previous steps
            # and to remove unused allocated space
            left_offset = max_input_length - batch.max_input_length
            batch_left_offset = (
                batch.attention_mask.shape[1]
                - batch.max_input_length
                - batch.padding_right_offset
            )
            attention_mask[
                start_index:end_index,
                left_offset:-padding_right_offset,
            ] = batch.attention_mask[
                :,
                batch_left_offset : -batch.padding_right_offset,
            ]
            image_attention_mask[
                start_index:end_index,
                left_offset:-padding_right_offset,
                :curr_batch_max_num_images,
            ] = batch.image_attention_mask[
                :, batch_left_offset : -batch.padding_right_offset, :
            ]

            # Create empty tensor
            # position_ids is always of shape [batch_size, 1]
            if position_ids is None:
                position_ids = batch.position_ids.new_empty((total_batch_size, 1))
            position_ids[start_index:end_index] = batch.position_ids

            # Shenanigans to get dimensions because BLOOM outputs a past with a different shape
            # BLOOM Keys:   [batch_size * num_heads, head_dim, seq_length]
            # BLOOM Values: [batch_size * num_heads, seq_length, head_dim]
            # And ensure that we can update tensors in-place
            if isinstance(batch.past_key_values[0], tuple):
                batch.past_key_values = [
                    [t.view(len(batch), -1, *t.shape[-2:]) for t in layer]
                    for layer in batch.past_key_values
                ]
            elif len(batch.past_key_values[0][0].shape) == 3:
                for layer in batch.past_key_values:
                    for k, t in enumerate(layer):
                        layer[k] = t.view(len(batch), -1, *t.shape[-2:])

            # Add eventual padding tokens that were added while concatenating
            max_tokens += batch.max_tokens + (
                max_input_length - batch.max_input_length
            ) * len(batch)

            start_index = end_index

        first_past_kvs = batches[0].past_key_values
        _, num_heads, padded_sequence_length, head_dim = first_past_kvs[0][1].shape

        padded_past_values_shape = (
            total_batch_size,
            num_heads,
            max_input_length - 1,
            head_dim,
        )

        if batches[0].keys_head_dim_last:
            padded_past_keys_shape = padded_past_values_shape
        else:
            # seq_length is last for BLOOM
            padded_past_keys_shape = (
                total_batch_size,
                num_heads,
                head_dim,
                max_input_length - 1,
            )

        # Iterate over attention layers
        # Concatenate past key values layer by layer to allow incremental garbage collection
        for j in range(len(first_past_kvs)):
            padded_past_keys = first_past_kvs[j][0].new_zeros(padded_past_keys_shape)
            start_index = 0
            for batch in batches:
                past_keys = batch.past_key_values[j][0]
                # Clear reference to the original tensor
                batch.past_key_values[j][0] = None

                # Slicing end index for this batch
                end_index = start_index + len(batch)
                # We slice the keys to remove the padding from previous batches
                past_seq_len = batch.max_input_length - 1
                if batch.keys_head_dim_last:
                    padded_past_keys[start_index:end_index, :, -past_seq_len:, :] = (
                        past_keys[:, :, -past_seq_len:, :]
                    )
                else:
                    # BLOOM case
                    padded_past_keys[start_index:end_index, :, :, -past_seq_len:] = (
                        past_keys[:, :, :, -past_seq_len:]
                    )
                del past_keys

                start_index = end_index

            padded_past_values = first_past_kvs[j][1].new_zeros(
                padded_past_values_shape
            )
            start_index = 0
            for batch in batches:
                past_values = batch.past_key_values[j][1]
                # Clear reference to the original tensor
                batch.past_key_values[j][1] = None

                # Slicing end index for this batch
                end_index = start_index + len(batch)
                # We slice the past values to remove the padding from previous batches
                past_seq_len = batch.max_input_length - 1
                padded_past_values[start_index:end_index, :, -past_seq_len:, :] = (
                    past_values[:, :, -past_seq_len:, :]
                )
                del past_values

                # Update values
                start_index = end_index

            past_key_values.append([padded_past_keys, padded_past_values])

        return cls(
            batch_id=batches[0].batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            pixel_values=pixel_values,
            image_hidden_states=image_hidden_states,
            image_attention_mask=image_attention_mask,
            past_key_values=past_key_values,
            all_input_ids=all_input_ids,
            input_lengths=input_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            max_input_length=max_input_length,
            padding_right_offset=padding_right_offset,
            keys_head_dim_last=batches[0].keys_head_dim_last,
            max_tokens=max_tokens,
        )

    def __len__(self):
        return len(self.requests)


class IdeficsCausalLM(Model):
    def __init__(
        self,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        self.quantize = quantize
        self.process_group, rank, world_size = initialize_torch_distributed()
        if torch.cuda.is_available():
            device = torch.device(f"cuda:{rank}")
            # 9b seems to work correctly enough in float16, but 80b seems
            # to be really saturating for f16.
            dtype = torch.float16 if dtype is None else dtype
        elif SYSTEM == "ipex":
            if hasattr(torch, "xpu") and torch.xpu.is_available():
                device = torch.device(f"xpu:{rank}")
                dtype = torch.float16 if dtype is None else dtype
            else:
                device = torch.device("cpu")
                # Float16 doesn't exist on target.
                dtype = torch.bfloat16 if dtype is None else dtype
        else:
            device = torch.device("cpu")
            dtype = torch.float32 if dtype is None else dtype
        self.device, self.dtype = device, dtype

        config = AutoConfig.from_pretrained(
            model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
        config.quantize = quantize
        config.speculator = speculator
        config.vision_config.quantize = quantize

        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        self.processor = AutoProcessor.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )

        weights_loader = get_loader(
            quantize=quantize, model_id=model_id, revision=revision
        )
        torch.distributed.barrier(group=self.process_group)
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(
            filenames,
            device=device,
            dtype=dtype,
            process_group=self.process_group,
            weights_loader=weights_loader,
        )

        model = IdeficsForVisionText2Text(config, weights)

        self.config = config

        torch.distributed.barrier(group=self.process_group)
        super().__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=True,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
        )

    @property
    def batch_type(self) -> Type[IdeficsCausalLMBatch]:
        return IdeficsCausalLMBatch

    def forward(
        self,
        input_ids,
        attention_mask,
        position_ids,
        pixel_values,
        image_hidden_states,
        image_attention_mask,
        past_key_values: Optional = None,
    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
        # Model Forward
        kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": pixel_values,
            "image_hidden_states": image_hidden_states,
            "image_attention_mask": image_attention_mask,
            "past_key_values": past_key_values,
            "use_cache": True,
            "return_dict": True,
        }
        if self.has_position_ids:
            kwargs["position_ids"] = position_ids

        outputs, speculative_logits = self.model.forward(**kwargs)
        return (
            outputs.logits,
            speculative_logits,
            outputs.past_key_values,
            outputs.image_hidden_states,
        )

    @tracer.start_as_current_span("generate_token")
    def generate_token(
        self, batch: IdeficsCausalLMBatch
    ) -> Tuple[List[Generation], Optional[IdeficsCausalLMBatch], Tuple[int, int]]:
        start = time.time_ns()
        # slice the attention mask to the correct shape
        attention_mask = batch.attention_mask[:, : -batch.padding_right_offset]
        if batch.image_attention_mask is None:
            image_attention_mask = None
        else:
            if batch.input_ids.size(1) == 1:
                # THIS is a hack: when calling idefics.generate, the first time, we need the whole image_attention_mask (size bs x max_seq_len x max_num_images),
                # but the subsequent times, we only need the last attention mask along the `max_seq_len` dimension
                # this is due to the nature IDEFICS: it's an encoder decoder, and so when decoding, only the currently generated
                # token need to attend to the encoder hidden states (i.e. the vision encoder)
                # Also see seq2seq_lm.Seq2SeqLM.generate_token which has roughly the same logic
                image_attention_mask = batch.image_attention_mask[
                    :, -(batch.padding_right_offset + 1)
                ].unsqueeze(1)
            else:
                image_attention_mask = batch.image_attention_mask[
                    :, : -batch.padding_right_offset
                ]

        logits, speculative_logits, past, image_hidden_states = self.forward(
            input_ids=batch.input_ids,
            attention_mask=attention_mask,
            position_ids=batch.position_ids,
            pixel_values=batch.pixel_values,
            image_hidden_states=batch.image_hidden_states,
            image_attention_mask=image_attention_mask,
            past_key_values=batch.past_key_values,
        )
        # Hardcoded remove image tokens
        logits[:, 32000:32001] = torch.finfo(logits.dtype).min

        start_decode = time.time_ns()

        # Results
        generations: List[Generation] = []
        stopped = True

        # Zipped iterator
        iterator = zip(
            batch.requests,
            batch.input_lengths,
            batch.prefix_offsets,
            batch.read_offsets,
            logits,
            batch.next_token_choosers,
            batch.stopping_criterias,
            batch.all_input_ids,
        )

        # For each member of the batch
        for i, (
            request,
            input_length,
            prefix_offset,
            read_offset,
            logits,
            next_token_chooser,
            stopping_criteria,
            all_input_ids,
        ) in enumerate(iterator):
            # Select next token
            next_token_id, logprobs = next_token_chooser(
                all_input_ids.view(1, -1), logits[-1:, :]
            )

            # Append next token to all tokens
            all_input_ids = torch.cat([all_input_ids, next_token_id])
            new_input_length = input_length + 1

            # Generated token
            next_token_logprob = logprobs[-1, next_token_id]
            next_token_id_squeezed = next_token_id.squeeze()
            next_token_text, prefix_offset, read_offset = self.decode_token(
                all_input_ids[:, 0], prefix_offset, read_offset
            )

            # Evaluate stopping criteria
            stop, reason = stopping_criteria(
                next_token_id_squeezed,
                next_token_text,
            )

            if not stop:
                stopped = False

            # Shard generations
            # All generations will be appended in the rust sharded client
            if i % self.world_size == self.rank:
                if stop:
                    # Decode generated tokens
                    output_text, _, _ = self.decode_token(
                        all_input_ids[:, 0],
                        prefix_offset=len(all_input_ids)
                        - stopping_criteria.current_tokens
                        - 1,
                        read_offset=len(all_input_ids)
                        - stopping_criteria.current_tokens,
                        skip_special_tokens=True,
                    )
                    # Get seed
                    if isinstance(next_token_chooser.choice, Sampling):
                        seed = next_token_chooser.choice.seed
                    else:
                        seed = None

                    generated_text = GeneratedText(
                        output_text, stopping_criteria.current_tokens, reason, seed
                    )
                else:
                    generated_text = None

                # Prefill
                if stopping_criteria.current_tokens == 1 and request.prefill_logprobs:
                    # Remove generated token to only have prefill and add nan for first prompt token
                    prefill_logprobs = [float("nan")] + torch.log_softmax(
                        logits, -1
                    ).gather(1, all_input_ids[1:]).squeeze(1)[
                        -new_input_length:-1
                    ].tolist()
                    prefill_token_ids = all_input_ids[-new_input_length:-1]
                    prefill_texts = self.tokenizer.batch_decode(
                        prefill_token_ids,
                        clean_up_tokenization_spaces=False,
                        skip_special_tokens=False,
                    )
                    prefill_tokens = Tokens(
                        prefill_token_ids,
                        prefill_logprobs,
                        prefill_texts,
                        is_special=[],
                    )
                else:
                    prefill_tokens = None

                top_tokens = None

                generation = Generation(
                    request.id,
                    prefill_tokens,
                    Tokens(
                        [next_token_id_squeezed],
                        [next_token_logprob],
                        [next_token_text],
                        [next_token_id_squeezed.item() in self.all_special_ids],
                    ),
                    generated_text,
                    top_tokens,
                )

                generations.append(generation)

            # Update values
            batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar(
                next_token_id_squeezed.item()
            )
            batch.input_ids[i, 0] = next_token_id
            batch.all_input_ids[i] = all_input_ids
            batch.input_lengths[i] = new_input_length
            batch.prefix_offsets[i] = prefix_offset
            batch.read_offsets[i] = read_offset
            batch.max_input_length = max(batch.max_input_length, new_input_length)

        # We finished all generations in the batch; there is no next batch
        if stopped:
            forward_ns = start_decode - start
            decode_ns = time.time_ns() - start_decode
            return generations, None, (forward_ns, decode_ns)

        # Slice unused values from prefill
        batch.input_ids = batch.input_ids[:, :1]

        # Update attention_mask as we added a new token to input_ids
        batch.attention_mask[:, -batch.padding_right_offset] = 1
        batch.image_attention_mask[:, -batch.padding_right_offset, :] = (
            batch.image_attention_mask[:, -(batch.padding_right_offset + 1), :]
        )
        # Decrease right offset
        batch.padding_right_offset -= 1

        # Update position_ids
        batch.position_ids = batch.position_ids[:, -1:] + 1

        # Update past key values
        batch.past_key_values = past
        batch.image_hidden_states = image_hidden_states

        forward_ns = start_decode - start
        decode_ns = time.time_ns() - start_decode
        return generations, batch, (forward_ns, decode_ns)


================================================
FILE: server/text_generation_server/models/mamba.py
================================================
import torch
import torch.distributed
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from typing import Optional, Union
from text_generation_server.models.custom_modeling.mamba_modeling import (
    MambaConfig,
)
from loguru import logger
from text_generation_server.pb import generate_pb2
from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
)
from text_generation_server.models.globals import CUDA_GRAPHS, MEM_POOL
import time
from text_generation_server.models.custom_modeling.mamba_modeling import (
    MambaModel,
    InferenceParams,
)
from text_generation_server.models import Model
from typing import Any, List, Tuple, Type, Dict
from text_generation_server.models.types import (
    Batch,
    Tokens,
    Generation,
    GeneratedText,
)
from text_generation_server.utils.chunks import concat_text_chunks
from text_generation_server.utils.quantization import get_loader
from text_generation_server.utils.tokens import batch_top_tokens, Sampling
from dataclasses import dataclass
from text_generation_server.utils import NextTokenChooser, StoppingCriteria


def new_inference_params(
    n_blocks: int,
    batch_size: int,
    d_inner: int,
    d_conv: int,
    d_state: int,
    seqlen_offset: int,
    dtype: torch.dtype,
    device: torch.device,
):
    max_seqlen = 0
    conv_states = torch.zeros(
        (
            n_blocks,
            batch_size,
            d_inner,
            d_conv,
        ),
        device=device,
        dtype=dtype,
    )
    ssm_states = torch.zeros(
        (
            n_blocks,
            batch_size,
            d_inner,
            d_state,
        ),
        device=device,
        dtype=dtype,
    )
    inference_params = InferenceParams(
        max_seqlen=max_seqlen,
        max_batch_size=batch_size,
        seqlen_offset=seqlen_offset,
        conv_states=conv_states,
        ssm_states=ssm_states,
    )
    return inference_params


@dataclass
class MambaBatch(Batch):
    batch_id: int
    requests: List[generate_pb2.Request]
    requests_idx_mapping: Dict[int, int]

    # Decoder values
    input_ids: torch.Tensor

    # All tokens
    all_input_ids: List[torch.Tensor]

    # Lengths of all generations present in the batch
    input_lengths: List[int]
    prefix_offsets: List[int]
    read_offsets: List[int]

    # Generation helpers
    next_token_choosers: List[NextTokenChooser]
    stopping_criterias: List[StoppingCriteria]
    top_n_tokens: List[int]
    top_n_tokens_tensor: torch.Tensor

    # Metadata used for padding
    max_input_length: int
    padding_right_offset: int

    # Maximum number of tokens this batch will grow to
    max_tokens: int

    # Past metadata
    keys_head_dim_last: bool = True

    # Inference params
    inference_params: Optional[Dict[str, Any]] = None

    def to_pb(self) -> generate_pb2.CachedBatch:
        return generate_pb2.CachedBatch(
            id=self.batch_id,
            request_ids=[r.id for r in self.requests],
            size=len(self),
            max_tokens=self.max_tokens,
            current_tokens=len(self),
        )

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "MambaBatch":
        inputs = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        prefix_offsets = []
        read_offsets = []
        requests_idx_mapping = {}

        # Parse batch
        max_truncation = 0
        padding_right_offset = 0
        max_decode_tokens = 0
        for i, r in enumerate(pb.requests):
            requests_idx_mapping[r.id] = i
            inputs.append(concat_text_chunks(r.input_chunks.chunks))
            next_token_choosers.append(
                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
            )
            stopping_criteria = StoppingCriteria.from_pb(
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(r.top_n_tokens)
            max_truncation = max(max_truncation, r.truncate)
            max_decode_tokens += stopping_criteria.max_new_tokens
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
            )

        tokenized_inputs = tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=max_truncation,
        ).to(device)
        for _ in pb.requests:
            input_len = tokenized_inputs["input_ids"].shape[1]
            prefix_offsets.append(input_len - 5)
            read_offsets.append(input_len)

        input_lengths = tokenized_inputs["attention_mask"].sum(1)
        max_input_length = input_lengths.max()
        input_ids = tokenized_inputs["input_ids"]
        all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1)
        top_n_tokens_tensor = torch.tensor(
            top_n_tokens, device=device, dtype=torch.int64
        )
        max_tokens = len(inputs) * (max_input_length + max_decode_tokens)
        return cls(
            batch_id=pb.id,
            requests=pb.requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            # past_input_ids=None,
            all_input_ids=list(all_input_ids),
            input_lengths=input_lengths.tolist(),
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length.item(),
            padding_right_offset=padding_right_offset,
            max_tokens=max_tokens,
        )

    def filter(self, request_ids: List[int]) -> Optional["MambaBatch"]:
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")
        if len(request_ids) == len(self):
            return self

        keep_indices = []

        # New values after filtering
        requests_idx_mapping = {}
        requests = []
        input_lengths = []
        prefix_offsets = []
        read_offsets = []
        all_input_ids = []
        max_input_length = 0

        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []

        total_remaining_decode_tokens = 0
        new_padding_right_offset = 0

        indices = []
        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            requests_idx_mapping[request_id] = i
            keep_indices.append(idx)

            requests.append(self.requests[idx])
            prefix_offsets.append(self.prefix_offsets[idx])
            read_offsets.append(self.read_offsets[idx])
            all_input_ids.append(self.all_input_ids[idx])

            request_input_length = self.input_lengths[idx]
            input_lengths.append(request_input_length)
            max_input_length = max(max_input_length, request_input_length)
            indices.append(idx)

            next_token_choosers.append(self.next_token_choosers[idx])
            stopping_criteria = self.stopping_criterias[idx]
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(self.top_n_tokens[idx])
            remaining_decode_tokens = (
                stopping_criteria.max_new_tokens - stopping_criteria.current_tokens
            )
            total_remaining_decode_tokens += remaining_decode_tokens
            new_padding_right_offset = max(
                new_padding_right_offset, remaining_decode_tokens
            )

        # Apply indices to input_ids, attention mask, past key values and other items that need to be cached
        input_ids = self.input_ids[keep_indices]

        top_n_tokens_tensor = self.top_n_tokens_tensor[keep_indices]
        max_tokens = len(request_ids) * max_input_length + total_remaining_decode_tokens

        self.requests = requests
        self.requests_idx_mapping = requests_idx_mapping
        self.input_ids = input_ids
        self.all_input_ids = all_input_ids
        self.input_lengths = input_lengths
        self.prefix_offsets = prefix_offsets
        self.read_offsets = read_offsets
        self.next_token_choosers = next_token_choosers
        self.stopping_criterias = stopping_criterias
        self.top_n_tokens = top_n_tokens
        self.top_n_tokens_tensor = top_n_tokens_tensor
        self.max_input_length = max_input_length
        self.padding_right_offset = new_padding_right_offset
        self.max_tokens = max_tokens

        # TODO
        # Kept it simple by just updating the state, maybe updating the other CPU values is necessary.
        self.inference_params.conv_states = self.inference_params.conv_states[
            :, indices
        ]
        self.inference_params.ssm_states = self.inference_params.ssm_states[:, indices]
        return self

    @classmethod
    def concatenate(cls, batches: List["MambaBatch"]) -> "MambaBatch":
        # Used for padding
        total_batch_size = 0
        max_input_length = 0
        padding_right_offset = 0
        for batch in batches:
            total_batch_size += len(batch)
            max_input_length = max(max_input_length, batch.max_input_length)
            padding_right_offset = max(padding_right_offset, batch.padding_right_offset)

        # Batch attributes
        requests = []
        requests_idx_mapping = {}
        input_lengths = []
        prefix_offsets = []
        read_offsets = []
        all_input_ids = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        max_tokens = 0
        seqlen_offset = 0

        (n_blocks, _, d_inner, d_conv) = batches[0].inference_params.conv_states.shape
        (_, _, _, d_state) = batches[0].inference_params.ssm_states.shape
        dtype = batches[0].inference_params.conv_states.dtype
        device = batches[0].inference_params.conv_states.device
        inference_params = new_inference_params(
            n_blocks=n_blocks,
            batch_size=total_batch_size,
            d_state=d_state,
            d_conv=d_conv,
            d_inner=d_inner,
            seqlen_offset=seqlen_offset,
            device=device,
            dtype=dtype,
        )

        # Batch tensors
        input_ids = None
        top_n_tokens_tensor = None

        # Used for slicing correctly inside the tensors
        # Equivalent to a cumsum on batch sizes
        start_index = 0
        for i, batch in enumerate(batches):
            requests.extend(batch.requests)
            input_lengths.extend(batch.input_lengths)
            prefix_offsets.extend(batch.prefix_offsets)
            read_offsets.extend(batch.read_offsets)
            all_input_ids.extend(batch.all_input_ids)
            next_token_choosers.extend(batch.next_token_choosers)
            stopping_criterias.extend(batch.stopping_criterias)
            top_n_tokens.extend(batch.top_n_tokens)

            if i == 0:
                requests_idx_mapping = batch.requests_idx_mapping
            else:
                # We need to offset the mapping for each batch by the cumulative batch size
                for k, v in batch.requests_idx_mapping.items():
                    requests_idx_mapping[k] = v + start_index

            # Slicing end index for this batch
            end_index = start_index + len(batch)

            # Create empty tensor
            # input_ids is always of shape [batch_size, 1]
            # We do not need to pad it
            if input_ids is None:
                input_ids = batch.input_ids.new_empty((total_batch_size, 1))
            # Copy to correct indices
            input_ids[start_index:end_index] = batch.input_ids

            if top_n_tokens_tensor is None:
                top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros(
                    total_batch_size,
                )
            top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor

            # Add eventual padding tokens that were added while concatenating
            max_tokens += batch.max_tokens + (
                max_input_length - batch.max_input_length
            ) * len(batch)

            inference_params.max_seqlen = max(
                inference_params.max_seqlen, batch.inference_params.max_seqlen
            )
            assert batch.inference_params.seqlen_offset != 0, "Invalid seqlen offset"
            inference_params.seqlen_offset = max(
                inference_params.seqlen_offset, batch.inference_params.seqlen_offset
            )

            inference_params.conv_states[:, start_index:end_index] = (
                batch.inference_params.conv_states
            )
            inference_params.ssm_states[:, start_index:end_index] = (
                batch.inference_params.ssm_states
            )

            start_index = end_index

        return cls(
            batch_id=batches[0].batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            all_input_ids=all_input_ids,
            input_lengths=input_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length,
            padding_right_offset=padding_right_offset,
            keys_head_dim_last=batches[0].keys_head_dim_last,
            max_tokens=max_tokens,
            inference_params=inference_params,
        )

    def __len__(self):
        return len(self.requests)


class Mamba(Model):
    def __init__(
        self,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        self.quantize = quantize
        self.process_group, _rank, world_size = initialize_torch_distributed()
        if world_size > 1:
            raise RuntimeError("Mamba does not support Tensor Parallelism (TP)")
        self.cuda_graphs = {}
        if torch.cuda.is_available():
            device = torch.device("cuda")
            # Bf16 is important. In f16 accumulations in the matmul are causing
            # differences while the server is under load.
            # This is detectable by the integration load test
            dtype = torch.bfloat16 if dtype is None else dtype
        else:
            if quantize:
                raise ValueError("quantization is not available on CPU")

            device = torch.device("cpu")
            dtype = torch.float32 if dtype is None else dtype

        tokenizer = AutoTokenizer.from_pretrained(
            "EleutherAI/gpt-neox-20b",
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        config = MambaConfig.from_pretrained(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )

        tokenizer.bos_token_id = config.bos_token_id
        tokenizer.eos_token_id = config.eos_token_id
        tokenizer.pad_token = tokenizer.eos_token

        config.quantize = quantize
        config.speculator = speculator
        torch.distributed.barrier(group=self.process_group)
        weights_loader = get_loader(
            quantize=quantize, model_id=model_id, revision=revision
        )
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(
            filenames,
            device,
            dtype,
            process_group=self.process_group,
            weights_loader=weights_loader,
        )
        model = MambaModel(config, weights)
        torch.distributed.barrier(group=self.process_group)
        super(Mamba, self).__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=True,
            dtype=dtype,
            device=device,
        )

    @property
    def batch_type(self) -> Type[MambaBatch]:
        return MambaBatch

    def warmup(
        self, batch, max_input_tokens: Optional[int], max_total_tokens: Optional[int]
    ) -> Union[Optional[int], Optional[int], Optional[int]]:
        # TODO: implement warmup for Mamba if needed
        if CUDA_GRAPHS:
            if self.speculate is None or self.speculate == 0:
                try:
                    logger.info(f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}")
                    # Warmup cuda graphs
                    for bs in CUDA_GRAPHS:
                        self.cuda_graph_warmup(bs)
                except Exception:
                    logger.exception("Decode cuda graph warmup failed")
        else:
            logger.info(f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS}).")

        if max_total_tokens is None:
            max_total_tokens = min(self.tokenizer.model_max_length, 4096)

        if max_input_tokens is None:
            max_input_tokens = max_total_tokens - 1
        return None, max_input_tokens, max_total_tokens

    def cuda_graph_warmup(self, batch_size: int):
        input_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=self.device)
        n_blocks = len(self.model.blocks)

        d_state = self.model.config.d_state
        d_conv = self.model.config.d_conv
        # Inner takes the expand multiplication
        d_inner = self.model.config.d_inner

        # Important seqlen_offset to go through the update mecanism with the state
        seqlen_offset = 1
        inference_params = new_inference_params(
            n_blocks=n_blocks,
            batch_size=batch_size,
            d_state=d_state,
            d_conv=d_conv,
            d_inner=d_inner,
            seqlen_offset=seqlen_offset,
            device=self.device,
            dtype=self.dtype,
        )

        graph = torch.cuda.CUDAGraph()

        torch.cuda.synchronize()
        # Run once outside to warmup
        self.model.forward(input_ids=input_ids, inference_params=inference_params)
        torch.cuda.synchronize()

        with torch.cuda.graph(graph, pool=MEM_POOL):
            logits, speculative_logits = self.model.forward(
                input_ids=input_ids, inference_params=inference_params
            )
        torch.cuda.synchronize()
        graph_dict = {
            "input_ids": input_ids,
            "inference_params": inference_params,
            "graph": graph,
            "logits": logits,
            "speculative_logits": speculative_logits,
        }
        self.cuda_graphs[batch_size] = graph_dict

    def tunableop_warmup(self, batch_size: int, seqlen: int):
        input_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=self.device)
        n_blocks = len(self.model.blocks)

        d_state = self.model.config.d_state
        d_conv = self.model.config.d_conv
        # Inner takes the expand multiplication
        d_inner = self.model.config.d_inner

        # Important seqlen_offset to go through the update mecanism with the state
        seqlen_offset = 1
        inference_params = new_inference_params(
            n_blocks=n_blocks,
            batch_size=seqlen,
            d_state=d_state,
            d_conv=d_conv,
            d_inner=d_inner,
            seqlen_offset=seqlen_offset,
            device=self.device,
            dtype=self.dtype,
        )

        self.model.forward(input_ids=input_ids, inference_params=inference_params)

    def forward(
        self, input_ids: torch.Tensor, inference_params: Any
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        bs = input_ids.shape[0]
        padded_bs = bs
        if bs == 3:
            padded_bs = 4
        elif 3 < bs <= 8:
            padded_bs = 8
        elif bs > 8:
            padded_bs = (bs + 7) // 8 * 8

        # Try to find an associated cuda graph
        cuda_graph = self.cuda_graphs.get(padded_bs, None)
        is_prefill = inference_params is None or inference_params.seqlen_offset == 0

        if is_prefill or cuda_graph is None:
            return self.model(
                input_ids,
                inference_params=inference_params,
            )

        # Copy inputs to the static inputs of the cuda graph
        # Static inputs are potentially padded
        cuda_graph["input_ids"][:bs] = input_ids
        cuda_graph["inference_params"].conv_states[
            :, :bs
        ] = inference_params.conv_states
        cuda_graph["inference_params"].ssm_states[:, :bs] = inference_params.ssm_states

        # Replay the graph
        cuda_graph["graph"].replay()

        inference_params.conv_states.copy_(
            cuda_graph["inference_params"].conv_states[:, :bs]
        )
        inference_params.ssm_states.copy_(
            cuda_graph["inference_params"].ssm_states[:, :bs]
        )
        # Slice output to the correct shape
        speculative_logits = (
            cuda_graph["speculative_logits"][:bs]
            if cuda_graph["speculative_logits"] is not None
            else None
        )
        logits = cuda_graph["logits"][:bs]
        return logits, speculative_logits

    def generate_token(self, batch) -> Tuple[List[Any], Optional[Any], Tuple[int, int]]:
        start = time.time_ns()
        input_ids = (
            batch.input_ids
        )  # batch.past_input_ids if batch.past_input_ids is not None else batch.input_ids

        batch_size, max_seqlen = input_ids.shape
        # Inference params

        if batch.inference_params is None:
            # 0 is important here
            seqlen_offset = 0
            n_blocks = len(self.model.blocks)
            d_state = self.model.config.d_state
            d_conv = self.model.config.d_conv
            d_inner = self.model.config.d_inner
            inference_params = new_inference_params(
                n_blocks=n_blocks,
                batch_size=batch_size,
                d_state=d_state,
                d_conv=d_conv,
                d_inner=d_inner,
                seqlen_offset=seqlen_offset,
                device=self.device,
                dtype=self.dtype,
            )
            batch.inference_params = inference_params

        # Forward pass
        logits, speculative_logits = self.forward(
            input_ids, inference_params=batch.inference_params
        )

        # batch.inference_params = new_inference_params
        # Results
        generations: List[Generation] = []
        stopped = True

        # Speculation is not active for causal
        accepted_ids = torch.ones_like(batch.input_ids)[:, 0]
        batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
            batch.top_n_tokens,
            batch.top_n_tokens_tensor,
            torch.log_softmax(logits[:, -1], -1),
            accepted_ids,
        )

        start_decode = time.time_ns()

        # Zipped iterator
        iterator = zip(
            batch.requests,
            batch.input_lengths,
            batch.prefix_offsets,
            batch.read_offsets,
            logits,
            batch.next_token_choosers,
            batch.stopping_criterias,
            batch.all_input_ids,
            batch.top_n_tokens,
            batch_top_token_ids,
            batch_top_token_logprobs,
        )

        # For each member of the batch
        for i, (
            request,
            input_length,
            prefix_offset,
            read_offset,
            logits,
            next_token_chooser,
            stopping_criteria,
            all_input_ids,
            top_n_tokens,
            top_token_ids,
            top_token_logprobs,
        ) in enumerate(iterator):
            # Select next token
            next_token_id, logprobs = next_token_chooser(
                all_input_ids.view(1, -1), logits[-1:, :]
            )

            # Append next token to all tokens
            all_input_ids = torch.cat([all_input_ids, next_token_id])
            new_input_length = input_length + 1

            # Generated token
            next_token_logprob = logprobs[-1, next_token_id]
            next_token_id_squeezed = next_token_id.squeeze()
            next_token_text, prefix_offset, read_offset = self.decode_token(
                all_input_ids[:, 0], prefix_offset, read_offset
            )

            # Evaluate stopping criteria
            stop, reason = stopping_criteria(
                next_token_id_squeezed,
                next_token_text,
            )

            if not stop:
                stopped = False

            # Shard generations
            # All generations will be appended in the rust sharded client
            if i % self.world_size == self.rank:
                if stop:
                    # Decode generated tokens
                    output_text, _, _ = self.decode_token(
                        all_input_ids[:, 0],
                        prefix_offset=len(all_input_ids)
                        - stopping_criteria.current_tokens
                        - 1,
                        read_offset=len(all_input_ids)
                        - stopping_criteria.current_tokens,
                        skip_special_tokens=True,
                    )
                    # Get seed
                    if isinstance(next_token_chooser.choice, Sampling):
                        seed = next_token_chooser.choice.seed
                    else:
                        seed = None

                    generated_text = GeneratedText(
                        output_text, stopping_criteria.current_tokens, reason, seed
                    )
                else:
                    generated_text = None

                if stopping_criteria.current_tokens == 1 and request.prefill_logprobs:
                    # Remove generated token to only have prefill and add nan for first prompt token
                    prefill_logprobs = [float("nan")] + torch.log_softmax(
                        logits, -1
                    ).gather(1, all_input_ids[1:]).squeeze(1)[
                        -new_input_length:-1
                    ].tolist()
                    prefill_token_ids = all_input_ids[-new_input_length:-1]
                    prefill_texts = self.tokenizer.batch_decode(
                        prefill_token_ids,
                        clean_up_tokenization_spaces=False,
                        skip_special_tokens=False,
                    )
                    prefill_tokens = Tokens(
                        prefill_token_ids,
                        prefill_logprobs,
                        prefill_texts,
                        is_special=[],
                    )
                else:
                    prefill_tokens = None

                if top_n_tokens > 0:
                    toptoken_texts = self.tokenizer.batch_decode(
                        top_token_ids,
                        clean_up_tokenization_spaces=False,
                        skip_special_tokens=False,
                    )
                    special_toptokens = [
                        token_id in self.all_special_ids for token_id in top_token_ids
                    ]
                    top_tokens = Tokens(
                        top_token_ids,
                        top_token_logprobs,
                        toptoken_texts,
                        special_toptokens,
                    )
                else:
                    top_tokens = None

                generation = Generation(
                    request.id,
                    prefill_tokens,
                    Tokens(
                        [next_token_id_squeezed],
                        [next_token_logprob],
                        [next_token_text],
                        [next_token_id_squeezed.item() in self.all_special_ids],
                    ),
                    generated_text,
                    top_tokens,
                )

                generations.append(generation)

                # Update values
                batch.next_token_choosers[i] = batch.next_token_choosers[
                    i
                ].advance_grammar(next_token_id_squeezed.item())
                batch.input_ids[i, 0] = next_token_id
                batch.all_input_ids[i] = all_input_ids
                batch.input_lengths[i] = new_input_length
                batch.prefix_offsets[i] = prefix_offset
                batch.read_offsets[i] = read_offset
                batch.max_input_length = max(batch.max_input_length, new_input_length)

        # We finished all generations in the batch; there is no next batch
        if stopped:
            forward_ns = start_decode - start
            decode_ns = time.time_ns() - start_decode
            return generations, None, (forward_ns, decode_ns)

        # Slice unused values from prefill
        batch.input_ids = batch.input_ids[:, :1]

        forward_ns = start_decode - start
        decode_ns = time.time_ns() - start_decode
        return generations, batch, (forward_ns, decode_ns)


================================================
FILE: server/text_generation_server/models/metadata_kernels.py
================================================
import torch
import triton

import triton.language as tl

from loguru import logger
from typing import List, Optional
from torch.utils._triton import has_triton as has_triton_torch

from text_generation_server.utils.import_utils import (
    SYSTEM,
)
from text_generation_server.utils.log import log_master

_HAS_TRITON: Optional[bool] = None


def has_triton():
    global _HAS_TRITON
    if _HAS_TRITON is None:
        # FIXME: it seems that has_triton_torch is bugged on RocM
        #        For now, only accept cuda
        _HAS_TRITON = has_triton_torch() if SYSTEM == "cuda" else False
        if _HAS_TRITON:
            log_master(logger.info, "Using optimized Triton indexing kernels.")

    return _HAS_TRITON


def block_tables_to_padded(
    max_blocks: int,
    cu_seqlen: torch.Tensor,
    block_tables: torch.Tensor,
    block_tables_ragged: torch.Tensor,
):
    def grid(meta):
        return (
            triton.cdiv(max_blocks, meta["BLOCK_SIZE"]),
            len(block_tables),
        )

    triton_block_tables_to_padded[grid](
        cu_seqlen,
        block_tables,
        block_tables_ragged,
        block_tables.shape[1],
        BLOCK_SIZE=256,
    )


def block_tables_to_ragged(
    *,
    block_tables: torch.Tensor,
    input_lengths: List[int],
    cache_lengths: List[int],
    input_lengths_tensor: torch.Tensor,
    cache_lengths_tensor: torch.Tensor,
    max_current_length: int,
) -> torch.Tensor:
    """Convert block table to ragged format compatible with FlashInfer."""
    assert len(input_lengths) == len(cache_lengths)

    total_len = sum(input_lengths) + sum(cache_lengths)
    block_tables_ragged = torch.empty(
        total_len, dtype=torch.int32, device=block_tables.device
    )

    if has_triton():
        cu_seqlen = input_lengths_tensor.new_zeros(input_lengths_tensor.shape[0] + 1)
        torch.cumsum(
            input_lengths_tensor + cache_lengths_tensor, out=cu_seqlen[1:], dim=0
        )

        def grid(meta):
            return (
                triton.cdiv(max_current_length, meta["BLOCK_SIZE"]),
                len(cache_lengths),
            )

        triton_block_tables_to_ragged[grid](
            cu_seqlen,
            block_tables,
            block_tables_ragged,
            block_tables.shape[1],
            BLOCK_SIZE=256,
        )
    else:
        offset = 0
        for i, (input_length, cache_length) in enumerate(
            zip(input_lengths, cache_lengths)
        ):
            seq_len = cache_length + input_length
            block_tables_ragged[offset : offset + seq_len] = block_tables[i][:seq_len]
            offset += seq_len

    return block_tables_ragged


def copy_next_input_ids_inplace(
    max_next_input_ids: int,
    all_input_ids: torch.Tensor,
    cache_lengths: torch.Tensor,
    input_lengths: torch.Tensor,
    prompt_lengths: torch.Tensor,
    next_input_ids: torch.Tensor,
    cu_accepted_ids: torch.Tensor,
):
    def grid(meta):
        return (
            triton.cdiv(max_next_input_ids, meta["BLOCK_SIZE"]),
            len(all_input_ids),
        )

    triton_copy_next_input_ids_inplace[grid](
        all_input_ids,
        cache_lengths,
        input_lengths,
        prompt_lengths,
        next_input_ids,
        cu_accepted_ids,
        all_input_ids.shape[1],
        BLOCK_SIZE=16,
    )


def prepare_position_slot_ids(
    max_input_length: int,
    cache_lengths: torch.Tensor,
    cu_seqlen: torch.Tensor,
    cu_slots: torch.Tensor,
    position_ids: torch.Tensor,
    slot_indices: torch.Tensor,
):
    def grid(meta):
        return (
            triton.cdiv(max_input_length, meta["BLOCK_SIZE"]),
            len(cache_lengths),
        )

    triton_prepare_position_slot_ids[grid](
        cache_lengths, cu_seqlen, cu_slots, position_ids, slot_indices, BLOCK_SIZE=256
    )


def slots_filtering(
    max_slots: int,
    slots: torch.Tensor,
    filtered_slots: torch.Tensor,
    cu_slots: torch.Tensor,
    slots_start: torch.Tensor,
):
    def grid(meta):
        return (
            triton.cdiv(max_slots, meta["BLOCK_SIZE"]),
            len(slots_start),
        )

    triton_slots_filtering[grid](
        slots, filtered_slots, slots_start, cu_slots, BLOCK_SIZE=256
    )


@triton.jit
def triton_slots_filtering(
    # Inputs
    slots_ptr,
    filtered_slots_ptr,
    slots_start_ptr,
    cu_slots_ptr,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
):
    # Position in block_tables_ragged.numel() / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)

    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)

    filter_start = tl.load(slots_start_ptr + bid)

    slot_start = tl.load(cu_slots_ptr + bid)
    slot_end = tl.load(cu_slots_ptr + bid + 1)

    mask = (slot_start + block_arange) < slot_end

    slots = tl.load(slots_ptr + filter_start + block_arange, mask=mask)
    tl.store(filtered_slots_ptr + slot_start + block_arange, slots, mask=mask)


@triton.jit
def triton_block_tables_to_padded(
    # Inputs
    cu_seqlen_ptr,
    # Outputs
    block_tables_ptr,
    block_tables_ragged_ptr,
    # Stride
    stride_block_tables,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
):
    # Position in block_tables_ragged.numel() / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)

    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)

    seq_start = tl.load(cu_seqlen_ptr + bid)
    seq_end = tl.load(cu_seqlen_ptr + bid + 1)

    mask = (seq_start + block_arange) < seq_end

    blocks = tl.load(block_tables_ragged_ptr + seq_start + block_arange, mask=mask)
    tl.store(
        block_tables_ptr + bid * stride_block_tables + block_arange, blocks, mask=mask
    )


@triton.jit
def triton_block_tables_to_ragged(
    # Inputs
    cu_seqlen_ptr,
    # Outputs
    block_tables_ptr,
    block_tables_ragged_ptr,
    # Stride
    stride_block_tables,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
):
    # Position in block_tables_ragged.numel() / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)

    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)

    seq_start = tl.load(cu_seqlen_ptr + bid)
    seq_end = tl.load(cu_seqlen_ptr + bid + 1)

    mask = (seq_start + block_arange) < seq_end

    blocks = tl.load(
        block_tables_ptr + bid * stride_block_tables + block_arange, mask=mask
    )
    tl.store(block_tables_ragged_ptr + seq_start + block_arange, blocks, mask=mask)


@triton.jit
def triton_copy_next_input_ids_inplace(
    # Inputs
    all_input_ids_ptr,
    cache_lengths_ptr,
    input_lengths_ptr,
    prompt_lengths_ptr,
    next_input_ids_ptr,
    cu_accepted_ids_ptr,
    # Stride
    stride_all_input_ids,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
):
    # Position in max_accepted_ids / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)

    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)

    # Used for correctly indexing in all_input_ids
    cache_length = tl.load(cache_lengths_ptr + bid)
    input_length = tl.load(input_lengths_ptr + bid)
    prompt_length = tl.load(prompt_lengths_ptr + bid)

    # Start/End of next_input_ids for this request
    next_input_ids_start = tl.load(cu_accepted_ids_ptr + bid)
    next_input_ids_end = tl.load(cu_accepted_ids_ptr + bid + 1)

    # Mask values out of range
    mask = (next_input_ids_start + block_arange) < next_input_ids_end

    # Mask values for request still prefilling
    decode_mask = (cache_length + input_length + block_arange) >= prompt_length

    mask = mask & decode_mask

    # Load this request next input ids
    next_input_ids = tl.load(
        next_input_ids_ptr + next_input_ids_start + block_arange, mask=mask
    )

    # Store in all_input_ids, since it is a 2D tensor, apply stride * bid
    tl.store(
        all_input_ids_ptr
        + stride_all_input_ids * bid
        + cache_length
        + input_length
        + block_arange,
        next_input_ids,
        mask=mask,
    )


@triton.jit
def triton_prepare_position_slot_ids(
    # Inputs
    cache_lengths_ptr,
    cu_seqlen_ptr,
    cu_slots_ptr,
    # Outputs
    position_ids_ptr,
    slot_indices_ptr,
    # Const values
    BLOCK_SIZE: "tl.constexpr",
):
    # Position in max_input_length / BLOCK_SIZE
    pid = tl.program_id(axis=0)
    # Position in batch
    bid = tl.program_id(axis=1)

    block_start = pid * BLOCK_SIZE
    block_arange = block_start + tl.arange(0, BLOCK_SIZE)

    cache_length = tl.load(cache_lengths_ptr + bid)

    seq_start = tl.load(cu_seqlen_ptr + bid)
    seq_end = tl.load(cu_seqlen_ptr + bid + 1)

    slot_start = tl.load(cu_slots_ptr + bid)

    mask = (seq_start + block_arange) < seq_end

    tl.store(
        position_ids_ptr + seq_start + block_arange,
        cache_length + block_arange,
        mask=mask,
    )
    tl.store(
        slot_indices_ptr + seq_start + block_arange,
        slot_start + cache_length + block_arange,
        mask=mask,
    )


================================================
FILE: server/text_generation_server/models/mllama_causal_lm.py
================================================
import torch

import numpy as np

from typing import Iterable, Optional, Tuple, List, Dict
from text_generation_server.pb.generate_pb2 import Request
from io import BytesIO
from PIL import Image
from dataclasses import dataclass
from opentelemetry import trace
from transformers import (
    PreTrainedTokenizerBase,
)

from text_generation_server.models.vlm_causal_lm import VlmCausalLMBatch, VlmCausalLM
from text_generation_server.pb import generate_pb2
from text_generation_server.models.globals import PREFIX_CACHING, ATTENTION
from text_generation_server.layers.attention import Seqlen
from text_generation_server.models.metadata_kernels import block_tables_to_ragged


tracer = trace.get_tracer(__name__)


@dataclass
class MllamaCausalLMBatch(VlmCausalLMBatch):
    image_indices: List[int] = 42
    aspect_ratio_ids: Optional[torch.Tensor] = None
    aspect_ratio_mask: Optional[torch.Tensor] = None
    cross_attention_states: Optional[torch.Tensor] = None

    def prepare_for_prefill(self):
        super(VlmCausalLMBatch, self).prepare_for_prefill()

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(cls, batches):
        batch = super(VlmCausalLMBatch, cls).concatenate(batches)
        batch.pixel_values = None
        batch.pixel_attention_mask = None

        offset = 0
        image_indices = []
        attention_states = []
        for b in batches:
            if b.cross_attention_states is not None:
                attention_states.append(b.cross_attention_states)
            image_indices.extend([i + offset for i in b.image_indices])
            offset += len(b.image_indices)
        if len(attention_states) > 0:
            assert len(image_indices) > 0
            batch.cross_attention_states = torch.cat(attention_states, dim=0)
            batch.image_indices = image_indices
        else:
            batch.cross_attention_states = None
            batch.image_indices = []
        return batch

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]):
        assert self.image_indices is not None
        batch = super(VlmCausalLMBatch, self).filter(request_ids)
        assert self.image_indices is not None
        indices = []
        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            indices.append(idx)

        offset = 0
        new_image_indices = []
        prev_i = None
        for i in self.image_indices:
            if i in indices:
                new_image_indices.append(offset)
                if i != prev_i:
                    offset += 1
                prev_i = i

        batch.image_indices = new_image_indices
        if len(new_image_indices) > 0:
            assert max(new_image_indices) < self.cross_attention_states.shape[0]
            assert offset <= self.cross_attention_states.shape[0]
            batch.cross_attention_states = self.cross_attention_states[
                new_image_indices
            ]
        else:
            batch.cross_attention_states = None
        batch.pixel_values = None
        return batch

    @classmethod
    def batch_tokenized_inputs(
        cls, requests: Iterable[Request], tokenizer, processor, config
    ):
        image_inputs = []
        texts = []
        image_indices = []
        batch_tokenized_inputs = []

        for i, r in enumerate(requests):
            # Each input is encoded into a list, where each element of this input list is either a string or a URL
            curr_text = ""
            curr_image = None
            curr_i = None
            for chunk in r.input_chunks.chunks:
                chunk_type = chunk.WhichOneof("chunk")
                if chunk_type == "text":
                    curr_text += chunk.text
                elif chunk_type == "image":
                    image = Image.open(BytesIO(chunk.image.data))
                    # TODO unsure about BOS
                    curr_text += "<|image|>"
                    image_input = processor.image_processor(image, return_tensors="pt")
                    curr_image = image_input
                    curr_i = i
                    # image_inputs.append(image_input)
                    # image_indices.append(i)
                else:
                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
            texts.append(curr_text)
            if curr_image is not None:
                image_inputs.append(curr_image)
                image_indices.append(curr_i)

            input_ids = tokenizer(
                curr_text,
                truncation=True,
                max_length=r.truncate,
                add_special_tokens=r.add_special_tokens,
            )["input_ids"]
            batch_tokenized_inputs.append(input_ids)
        if image_inputs:
            image_input = image_inputs[0]
            new_image_inputs = {
                "pixel_values": torch.cat(
                    [img["pixel_values"] for img in image_inputs], dim=0
                ),
            }
            if "aspect_ratio_ids" in image_input:
                new_image_inputs["aspect_ratio_ids"] = torch.cat(
                    [img["aspect_ratio_ids"] for img in image_inputs], dim=0
                )
            if "aspect_ratio_mask" in image_input:
                new_image_inputs["aspect_ratio_mask"] = torch.cat(
                    [img["aspect_ratio_mask"] for img in image_inputs], dim=0
                )
            image_inputs = new_image_inputs
            image_inputs["image_indices"] = image_indices
        else:
            image_inputs = None

        if image_inputs is not None:
            assert len(image_indices) == image_inputs["pixel_values"].shape[0]

        return batch_tokenized_inputs, image_inputs

    @classmethod
    def from_pb_processor(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        processor,
        config,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "VlmCausalLMBatch":
        batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs(
            pb.requests, tokenizer, processor, config
        )
        batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device)
        # XXX: <|image|> token is actually out of bounds and bugs out the logit processors.
        batch.all_input_ids_tensor = batch.all_input_ids_tensor.clamp(
            max=config.text_config.vocab_size - 1
        )
        if isinstance(batch.input_ids, list):
            if len(batch) > 1:
                input_ids = np.concatenate(batch.input_ids, dtype=np.int64)
            else:
                input_ids = batch.input_ids[0]
            batch.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)

        batch.input_ids = batch.input_ids.clamp(max=config.text_config.vocab_size - 1)

        if image_inputs is not None:
            batch.pixel_values = image_inputs["pixel_values"].to(
                device=device, dtype=dtype
            )
            batch.aspect_ratio_ids = image_inputs["aspect_ratio_ids"].to(device=device)
            batch.aspect_ratio_mask = image_inputs["aspect_ratio_mask"].to(
                device=device
            )
            batch.image_indices = image_inputs["image_indices"]
        else:
            batch.pixel_values = None
            batch.aspect_ratio_ids = None
            batch.aspect_ratio_mask = None
            batch.image_indices = []
        assert batch.image_indices is not None
        return batch


class MllamaCausalLM(VlmCausalLM):
    def set_inputs_embeds(self, batch):
        # Set the input embeddings to None, as we are using the input_ids for the model
        batch.inputs_embeds = None

    def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
        super(VlmCausalLM, self).cuda_graph_warmup(bs, max_s, max_bt)

    def forward(
        self,
        batch: MllamaCausalLMBatch,
        adapter_data: Optional[Dict[str, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Model Forward
        if batch.speculative_ids is not None:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

            speculative_ids = batch.speculative_ids

            B, speculative_length = speculative_ids.shape
            new_length = speculative_length + 1
            new_input_ids = torch.cat(
                [input_ids.unsqueeze(-1), speculative_ids], dim=1
            ).reshape(-1)
            arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0)
            arange_int = arange.to(dtype=torch.int32)
            new_position_ids = (
                position_ids.unsqueeze(-1).expand(B, new_length) + arange
            ).view(-1)
            slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1)
            input_lengths = (
                input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
            ).view(-1)
            cache_lengths_tensor = (
                batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length)
            ).reshape(-1)

            # Add Copy the block tables for all members
            block_tables = (
                block_tables.unsqueeze(1)
                .expand(B, new_length, -1)
                .reshape(B * new_length, -1)
                .contiguous()
            )
            max_s = max_s + speculative_length

            input_ids = new_input_ids
            position_ids = new_position_ids
        else:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            cache_lengths_tensor = batch.cache_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

        # Try to find an associated cuda graph
        bs = input_ids.shape[0]
        sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
        if sorted_padded_bs:
            # Get associated cuda graph
            cuda_graph = self.cuda_graphs[sorted_padded_bs[0]]
        else:
            cuda_graph = None
        if (
            cu_seqlen_prefill is not None
            or cuda_graph is None
            # Only run cuda graphs when there's no images.
            or batch.cross_attention_states is not None
        ):
            if PREFIX_CACHING:
                block_tables = block_tables_to_ragged(
                    block_tables=block_tables,
                    input_lengths=batch.input_lengths,
                    cache_lengths=batch.cache_lengths,
                    input_lengths_tensor=batch.input_lengths_tensor,
                    cache_lengths_tensor=batch.cache_lengths_tensor,
                    max_current_length=batch.max_current_length,
                )
            with self._forward_context(
                block_tables=block_tables,
                cu_seqlen_prefill=cu_seqlen_prefill,
                input_lengths_tensor=input_lengths,
                cache_lengths_tensor=cache_lengths_tensor,
            ):
                seqlen = Seqlen(
                    input_lengths=input_lengths,
                    cache_lengths=cache_lengths_tensor,
                    cu_seqlen_q=cu_seqlen_prefill,
                    max_q=batch.max_input_length,
                    max_k=batch.max_current_length,
                )

                if batch.pixel_values is not None:
                    cross_attention_states = self.model.vision_forward(
                        pixel_values=batch.pixel_values,
                        aspect_ratio_ids=batch.aspect_ratio_ids,
                        aspect_ratio_mask=batch.aspect_ratio_mask,
                    )
                    batch.cross_attention_states = cross_attention_states

                cross_attention_states = batch.cross_attention_states

                logits, speculative_logits = self.model.forward(
                    input_ids=input_ids,
                    position_ids=position_ids,
                    cu_seqlen_prefill=cu_seqlen_prefill,
                    kv_cache=kv_cache,
                    block_tables=block_tables,
                    slots=slots,
                    seqlen=seqlen,
                    max_s=max_s,
                    prefill_cache_indices=batch.prefill_cache_indices,
                    lm_head_indices=lm_head_indices,
                    cross_attention_states=cross_attention_states,
                    adapter_data=adapter_data,
                    image_indices=batch.image_indices[:],
                )
                if batch.prefill_cache_indices is not None:
                    batch.prefill_cache_indices = None
                if batch.pixel_values is not None:
                    batch.pixel_values = None
                return logits, speculative_logits

        # Copy inputs to the static inputs of the cuda graph
        # Static inputs are potentially padded
        cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids
        cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids
        if ATTENTION == "flashinfer":
            block_tables = block_tables_to_ragged(
                block_tables=block_tables,
                input_lengths=batch.input_lengths,
                cache_lengths=batch.cache_lengths,
                input_lengths_tensor=batch.input_lengths_tensor,
                cache_lengths_tensor=batch.cache_lengths_tensor,
                max_current_length=batch.max_current_length,
            )
            cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
        else:
            cuda_graph["block_tables"][
                : block_tables.shape[0], : block_tables.shape[1]
            ] = block_tables

        # XXX: This is working only because block 0 is reserved for the healthcheck
        # so it doesn't matter if we override it with bogus values.
        cuda_graph["slots"].fill_(0)
        cuda_graph["slots"][: slots.shape[0]] = slots
        cuda_graph["input_lengths"].zero_()
        cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
        cuda_graph["cache_lengths"].zero_()
        cuda_graph["cache_lengths"][
            : cache_lengths_tensor.shape[0]
        ] = cache_lengths_tensor

        with self._forward_context(
            block_tables=cuda_graph["block_tables"],
            cu_seqlen_prefill=None,
            input_lengths_tensor=cuda_graph["input_lengths"],
            cache_lengths_tensor=cuda_graph["cache_lengths"],
            state=cuda_graph["state"],
        ):
            # Replay the graph
            cuda_graph["graph"].replay()

        # Slice output to the correct shape
        speculative_logits = (
            cuda_graph["speculative_logits"][:bs]
            if cuda_graph["speculative_logits"] is not None
            else None
        )
        logits = cuda_graph["logits"][:bs]
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/models/model.py
================================================
import inspect
import torch

from abc import ABC, abstractmethod
from typing import List, Tuple, Optional, TypeVar, Type, Dict
from collections import defaultdict
from transformers import PreTrainedTokenizerBase
from loguru import logger

from text_generation_server.models.globals import (
    ATTENTION,
    PREFIX_CACHING,
    BLOCK_SIZE,
    PREFILL_CHUNKING,
)
from text_generation_server.models.types import Batch, Generation
from text_generation_server.utils.log import log_master
from text_generation_server.utils.prefill_chunking import set_support_chunking
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.pb.generate_pb2 import InfoResponse
from text_generation_server.adapters.weights import LayerAdapterWeights

BASE_MODEL_ADAPTER_ID = "__base_model__"


B = TypeVar("B", bound=Batch)


class Model(ABC):
    def __init__(
        self,
        model_id: str,
        model: torch.nn.Module,
        tokenizer: PreTrainedTokenizerBase,
        requires_padding: bool,
        dtype: torch.dtype,
        device: torch.device,
        rank: int = 0,
        world_size: int = 1,
        sliding_window: Optional[int] = None,
        speculate: Optional[int] = None,
        adapter_id: str = BASE_MODEL_ADAPTER_ID,
        support_chunking: bool = False,
    ):
        self.model_id = model_id
        self.model = model.eval()
        self.tokenizer = tokenizer

        # all_special_ids is not set correctly if the rust tokenizer is unpacked
        # TODO report this to transformers.
        other_special_ids = {
            id for id, token in tokenizer.added_tokens_decoder.items() if token.special
        }
        self.all_special_ids = set(tokenizer.all_special_ids)
        self.all_special_ids.update(other_special_ids)
        self.requires_padding = requires_padding
        self.dtype = dtype
        self.device = device
        self.rank = rank
        self.world_size = world_size
        self.sliding_window = sliding_window if sliding_window != -1 else None

        self.layer_to_adapter_weights: Dict[str, LayerAdapterWeights] = defaultdict(
            LayerAdapterWeights
        )
        self.loaded_adapters = set()
        self.static_adapter_id = adapter_id

        if speculate is None:
            speculate = get_speculate()
        self.speculate = speculate

        support_chunking = support_chunking and PREFILL_CHUNKING

        if speculate != 0 and support_chunking:
            log_master(
                logger.warning,
                "Prefill chunking does not support speculation yet. "
                "Prefill chunking will be turned off",
            )
            support_chunking = False
        if (
            ATTENTION not in ["flashinfer", "flashdecoding", "flashdecoding-ipex"]
            and support_chunking
        ):
            log_master(
                logger.warning,
                "Prefill chunking is only supported with `flashinfer` or `flashdecoding` or `flashdecoding-ipex` attention types.",
            )
            support_chunking = False

        log_master(logger.info, f"Using prefill chunking = {support_chunking}")

        self.support_chunking = support_chunking
        set_support_chunking(support_chunking)

        self.has_position_ids = (
            inspect.signature(model.forward).parameters.get("position_ids", None)
            is not None
        )

        self.check_initialized()

    @property
    def info(self) -> InfoResponse:
        if self.requires_padding and self.sliding_window is not None:
            raise NotImplementedError("sliding_window is not implemented with padding")

        return InfoResponse(
            requires_padding=self.requires_padding,
            dtype=str(self.dtype),
            device_type=self.device.type,
            window_size=None,  # Setting this parameter to None disabled the block logic with sliding window.
            speculate=self.speculate,
            support_chunking=self.support_chunking,
            use_prefix_caching=PREFIX_CACHING,
            attention_impl=ATTENTION,
            block_size=BLOCK_SIZE,
        )

    @property
    @abstractmethod
    def batch_type(self) -> Type[B]:
        raise NotImplementedError

    @abstractmethod
    def generate_token(
        self, batch: B
    ) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]:
        raise NotImplementedError

    def warmup(
        self, batch: B, max_input_tokens: Optional[int], max_total_tokens: Optional[int]
    ) -> Tuple[Optional[int], int, int]:
        self.generate_token(batch)
        total = sum(len(i) for i in batch.input_ids)
        if max_total_tokens is None:
            max_total_tokens = total

        if max_input_tokens is None:
            max_input_tokens = max_total_tokens - 1
        return None, max_input_tokens, max_total_tokens

    def decode_token(
        self,
        all_input_ids: List[int],
        prefix_offset: int = 0,
        read_offset: int = 0,
        skip_special_tokens: bool = False,
    ) -> Tuple[str, int, int]:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""

        # The prefix text is necessary only to defeat cleanup algorithms in the decode
        # which decide to add a space or not depending on the surrounding ids.
        prefix_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
        )
        new_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
        )

        if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
            # utf-8 char at the end means it's a potential unfinished byte sequence
            # from byte fallback tokenization.
            # If it's in the middle, it's probably a real invalid id generated
            # by the model
            new_text = new_text[len(prefix_text) :]
            return new_text, read_offset, len(all_input_ids)
        else:
            return "", prefix_offset, read_offset

    def check_initialized(self):
        uninitialized_parameters = []
        for n, p in self.model.named_parameters():
            if p.data.device == torch.device("meta"):
                uninitialized_parameters.append(n)
        if uninitialized_parameters:
            raise RuntimeError(
                f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
            )


================================================
FILE: server/text_generation_server/models/seq2seq_lm.py
================================================
import torch
import torch.distributed
import time
from dataclasses import dataclass
from opentelemetry import trace
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    PreTrainedTokenizerBase,
    AutoConfig,
)
from typing import Optional, Tuple, List, Type, Dict
from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
)
from text_generation_server.utils.chunks import concat_text_chunks
from text_generation_server.utils.quantization import get_loader
from text_generation_server.utils.tokens import batch_top_tokens
from text_generation_server.models import Model
from text_generation_server.models.types import (
    GeneratedText,
    Batch,
    Generation,
    Tokens,
)
from text_generation_server.pb import generate_pb2
from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling

tracer = trace.get_tracer(__name__)


@dataclass
class Seq2SeqLMBatch(Batch):
    batch_id: int
    requests: List[generate_pb2.Request]
    requests_idx_mapping: Dict[int, int]

    # Encoder values
    input_ids: Optional[torch.Tensor]
    attention_mask: torch.Tensor

    # Decoder values
    decoder_input_ids: torch.Tensor
    decoder_attention_mask: Optional[torch.Tensor]
    encoder_last_hidden_state: Optional[torch.Tensor]

    # All tokens
    all_decoder_input_ids: List[torch.Tensor]

    # Seq2SeqLM keeps track of both encoder and decoder attention keys and values
    past_key_values: Optional[List[Tuple]]

    # Lengths of all generations present in the batch
    input_lengths: List[int]
    decoder_input_lengths: List[int]
    prefix_offsets: List[int]
    read_offsets: List[int]

    # Generation helpers
    next_token_choosers: List[NextTokenChooser]
    stopping_criterias: List[StoppingCriteria]
    top_n_tokens: List[int]
    top_n_tokens_tensor: torch.Tensor

    # Metadata used for padding
    max_input_length: int
    max_decoder_input_length: int
    padding_right_offset: int

    # Maximum number of tokens this batch will grow to
    max_tokens: int

    def to_pb(self) -> generate_pb2.CachedBatch:
        """Convert a Seq2SeqLMBatch to a text_generation_server.v1.CachedBatch protobuf"""
        return generate_pb2.CachedBatch(
            id=self.batch_id,
            request_ids=[r.id for r in self.requests],
            size=len(self),
            max_tokens=self.max_tokens,
            current_tokens=len(self.decoder_input_ids),
        )

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "Seq2SeqLMBatch":
        """Convert a text_generation_server.v1.Batch protobuf to a Seq2SeqLMBatch"""
        inputs = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        decoder_input_lengths = []
        prefix_offsets = []
        read_offsets = []
        requests_idx_mapping = {}

        # Parse batch
        max_truncation = 0
        padding_right_offset = 0
        max_decode_tokens = 0
        for i, r in enumerate(pb.requests):
            inputs.append(concat_text_chunks(r.input_chunks.chunks))
            requests_idx_mapping[r.id] = i
            decoder_input_lengths.append(1)
            next_token_choosers.append(
                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
            )
            stopping_criteria = StoppingCriteria.from_pb(
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(r.top_n_tokens)
            max_truncation = max(max_truncation, r.truncate)
            max_decode_tokens += stopping_criteria.max_new_tokens
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
            )

        # Tokenize batch
        tokenized_inputs = tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=max_truncation,
        ).to(device)

        input_lengths = tokenized_inputs["attention_mask"].sum(1)
        max_input_length = input_lengths.max()

        # Decoder sequence only contains the bos_token
        decoder_input_ids = (
            torch.tensor(tokenizer.bos_token_id, device=device)
            .repeat(len(pb.requests))
            .view(-1, 1)
        )
        for _ in pb.requests:
            prefix_offsets.append(0)
            read_offsets.append(1)
        all_decoder_input_ids = decoder_input_ids.view(-1).split(1)
        top_n_tokens_tensor = torch.tensor(
            top_n_tokens, device=device, dtype=torch.int64
        )

        max_tokens = len(inputs) * (max_input_length + max_decode_tokens)

        return cls(
            batch_id=pb.id,
            requests=pb.requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=tokenized_inputs["input_ids"],
            attention_mask=tokenized_inputs["attention_mask"],
            decoder_input_ids=decoder_input_ids,
            all_decoder_input_ids=list(all_decoder_input_ids),
            decoder_attention_mask=None,
            encoder_last_hidden_state=None,
            past_key_values=None,
            input_lengths=input_lengths.tolist(),
            decoder_input_lengths=decoder_input_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length.item(),
            max_decoder_input_length=1,
            padding_right_offset=padding_right_offset,
            max_tokens=max_tokens,
        )

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]) -> Optional["Seq2SeqLMBatch"]:
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")
        if len(request_ids) == len(self):
            return self

        keep_indices = []

        # New values after filtering
        requests_idx_mapping = {}
        requests = []
        input_lengths = []
        decoder_input_lengths = []
        prefix_offsets = []
        read_offsets = []

        all_decoder_input_ids = []

        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []

        max_input_length = 0
        max_decoder_input_length = 0
        padding_right_offset = 0

        total_remaining_decode_tokens = 0

        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            requests_idx_mapping[request_id] = i
            keep_indices.append(idx)

            requests.append(self.requests[idx])
            prefix_offsets.append(self.prefix_offsets[idx])
            read_offsets.append(self.read_offsets[idx])

            all_decoder_input_ids.append(self.all_decoder_input_ids[idx])

            request_input_length = self.input_lengths[idx]
            input_lengths.append(request_input_length)
            max_input_length = max(max_input_length, request_input_length)

            request_decoder_input_length = self.decoder_input_lengths[idx]
            decoder_input_lengths.append(request_decoder_input_length)
            max_decoder_input_length = max(
                max_decoder_input_length, request_decoder_input_length
            )

            next_token_choosers.append(self.next_token_choosers[idx])
            stopping_criteria = self.stopping_criterias[idx]
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(self.top_n_tokens[idx])
            remaining_decode_tokens = (
                stopping_criteria.max_new_tokens - stopping_criteria.current_tokens
            )
            total_remaining_decode_tokens += remaining_decode_tokens
            padding_right_offset = max(padding_right_offset, remaining_decode_tokens)

        # Apply indices to input_ids, attention mask, past key values and other items that need to be cached
        self.decoder_input_ids = self.decoder_input_ids[keep_indices]
        self.attention_mask = self.attention_mask[keep_indices, -max_input_length:]
        if self.decoder_attention_mask is not None:
            self.decoder_attention_mask = self.decoder_attention_mask[
                keep_indices,
                -(self.padding_right_offset + max_decoder_input_length) : (
                    self.decoder_attention_mask.shape[1] - self.padding_right_offset
                )
                + padding_right_offset,
            ]

        self.encoder_last_hidden_state = self.encoder_last_hidden_state[
            keep_indices, -max_input_length:
        ]

        # Ensure that past_key_values tensors can be updated in-place
        if type(self.past_key_values[0]) is tuple:
            self.past_key_values = [
                [t for t in layer] for layer in self.past_key_values
            ]

        decoder_past_seq_len = max_decoder_input_length - 1
        for layer in self.past_key_values:
            layer[0] = layer[0][keep_indices, :, -decoder_past_seq_len:]
            layer[1] = layer[1][keep_indices, :, -decoder_past_seq_len:]
            layer[2] = layer[2][keep_indices, :, -max_input_length:]
            layer[3] = layer[3][keep_indices, :, -max_input_length:]

        top_n_tokens_tensor = self.top_n_tokens_tensor[keep_indices]
        max_tokens = (
            len(request_ids) * (max_input_length + max_decoder_input_length)
            + remaining_decode_tokens
        )

        self.requests = requests
        self.requests_idx_mapping = requests_idx_mapping
        self.input_ids = None
        self.all_decoder_input_ids = all_decoder_input_ids
        self.input_lengths = input_lengths
        self.decoder_input_lengths = decoder_input_lengths
        self.prefix_offsets = prefix_offsets
        self.read_offsets = read_offsets
        self.next_token_choosers = next_token_choosers
        self.stopping_criterias = stopping_criterias
        self.top_n_tokens = top_n_tokens
        self.top_n_tokens_tensor = top_n_tokens_tensor
        self.max_input_length = max_input_length
        self.max_decoder_input_length = max_decoder_input_length
        self.padding_right_offset = padding_right_offset
        self.max_tokens = max_tokens

        return self

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(cls, batches: List["Seq2SeqLMBatch"]) -> "Seq2SeqLMBatch":
        """Concatenate multiple batches together by padding internal torch tensors"""

        # Used for padding
        total_batch_size = 0
        max_input_length = 0
        max_decoder_input_length = 0
        padding_right_offset = 0
        for batch in batches:
            total_batch_size += len(batch)
            max_input_length = max(max_input_length, batch.max_input_length)
            max_decoder_input_length = max(
                max_decoder_input_length, batch.max_decoder_input_length
            )
            padding_right_offset = max(padding_right_offset, batch.padding_right_offset)

        # Batch attributes
        requests = []
        requests_idx_mapping = {}
        all_decoder_input_ids = []
        input_lengths = []
        decoder_input_lengths = []
        prefix_offsets = []
        read_offsets = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        max_tokens = 0

        # Batch tensors
        attention_mask = None
        decoder_input_ids = None
        decoder_attention_mask = None
        encoder_last_hidden_state = None
        top_n_tokens_tensor = None
        past_key_values = []

        # Used for slicing correctly inside the tensors
        # Equivalent to a cumsum on batch sizes
        start_index = 0

        for i, batch in enumerate(batches):
            # Extend all list attributes
            requests.extend(batch.requests)
            all_decoder_input_ids.extend(batch.all_decoder_input_ids)
            input_lengths.extend(batch.input_lengths)
            decoder_input_lengths.extend(batch.decoder_input_lengths)
            prefix_offsets.extend(batch.prefix_offsets)
            read_offsets.extend(batch.read_offsets)
            next_token_choosers.extend(batch.next_token_choosers)
            stopping_criterias.extend(batch.stopping_criterias)
            top_n_tokens.extend(batch.top_n_tokens)

            if i == 0:
                requests_idx_mapping = batch.requests_idx_mapping
            else:
                # We need to offset the mapping for each batch by the cumulative batch size
                for k, v in batch.requests_idx_mapping.items():
                    requests_idx_mapping[k] = v + start_index

            # Slicing end index for this batch
            end_index = start_index + len(batch)

            # We only concatenate batches that did at least one step
            if batch.encoder_last_hidden_state is None:
                raise ValueError("Batch encoder_last_hidden_state cannot be None")

            # Create padded tensor
            if attention_mask is None:
                attention_mask = batch.attention_mask.new_zeros(
                    (total_batch_size, max_input_length),
                )
            # Copy to correct indices
            attention_mask[start_index:end_index, -batch.max_input_length :] = (
                batch.attention_mask[:, -batch.max_input_length :]
            )

            # Create padded tensor
            if decoder_input_ids is None:
                decoder_input_ids = batch.decoder_input_ids.new_zeros(
                    (total_batch_size, 1),
                )
            # Copy to correct indices
            decoder_input_ids[start_index:end_index] = batch.decoder_input_ids

            # Create padded tensor
            if decoder_attention_mask is None:
                # As decoder_attention_mask might not exist, we use `batch.attention_mask` for device here
                decoder_attention_mask = batch.attention_mask.new_zeros(
                    (total_batch_size, max_decoder_input_length + padding_right_offset),
                )
            # If the decoder mask does not exist yet, all generations started at the same time and we never concatenated
            # this batch. All generations are of length `batch.max_decoder_input_length`.
            left_offset = max_decoder_input_length - batch.max_decoder_input_length
            if batch.decoder_attention_mask is None:
                decoder_attention_mask[
                    start_index:end_index,
                    left_offset:-padding_right_offset,
                ] = 1
            # If it exists, we need to index
            else:
                batch_left_offset = (
                    batch.decoder_attention_mask.shape[1]
                    - batch.max_decoder_input_length
                    - batch.padding_right_offset
                )
                decoder_attention_mask[
                    start_index:end_index,
                    left_offset:-padding_right_offset,
                ] = batch.decoder_attention_mask[
                    :,
                    batch_left_offset : -batch.padding_right_offset,
                ]

            # Create padded tensor
            if encoder_last_hidden_state is None:
                encoder_last_hidden_state = batch.encoder_last_hidden_state.new_zeros(
                    (
                        total_batch_size,
                        max_input_length,
                        batch.encoder_last_hidden_state.shape[-1],
                    ),
                )

            if top_n_tokens_tensor is None:
                top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros(
                    total_batch_size,
                )
            top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor

            # Copy to correct indices
            encoder_last_hidden_state[
                start_index:end_index, -batch.max_input_length :, :
            ] = batch.encoder_last_hidden_state[:, -batch.max_input_length :, :]
            batch.encoder_last_hidden_state = None

            # Ensure that we can update tensors in-place
            if isinstance(batch.past_key_values[0], tuple):
                batch.past_key_values = [
                    [t for t in layer] for layer in batch.past_key_values
                ]

            # Add eventual padding tokens that were added while concatenating
            max_tokens += batch.max_tokens + (
                max_input_length
                - batch.max_input_length
                + max_decoder_input_length
                - batch.max_decoder_input_length
            ) * len(batch)

            start_index = end_index

        # Determine shapes for new past kv tensors
        first_past_kvs = batches[0].past_key_values
        _, num_heads, _, head_dim = first_past_kvs[0][0].shape

        padded_dec_t_shape = (
            total_batch_size,
            num_heads,
            (max_decoder_input_length - 1),
            head_dim,
        )

        padded_enc_t_shape = (
            total_batch_size,
            num_heads,
            max_input_length,
            head_dim,
        )

        # Iterate over attention layers
        for j in range(len(first_past_kvs)):
            past_key_values.append([])

            # Decoder past
            for k in range(0, 2):
                # Initialize tensors
                padded_past_values = first_past_kvs[j][k].new_zeros(padded_dec_t_shape)
                past_key_values[j].append(padded_past_values)

                start_index = 0
                for batch in batches:
                    t = batch.past_key_values[j][k]
                    # Clear reference to the original tensor
                    batch.past_key_values[j][k] = None
                    # Slicing end index for this batch
                    end_index = start_index + len(batch)
                    # We slice the past keys and values to remove the padding from previous batches
                    past_seq_len = batch.max_decoder_input_length - 1
                    padded_past_values[start_index:end_index, :, -past_seq_len:, :] = t[
                        :, :, -past_seq_len:, :
                    ]
                    del t

                    start_index = end_index

            # Encoder past
            for k in range(2, 4):
                # Initialize tensors
                padded_past_values = first_past_kvs[j][k].new_zeros(padded_enc_t_shape)
                past_key_values[j].append(padded_past_values)

                start_index = 0
                for batch in batches:
                    t = batch.past_key_values[j][k]
                    # Clear reference to the original tensor
                    batch.past_key_values[j][k] = None
                    # Slicing end index for this batch
                    end_index = start_index + len(batch)
                    # We slice the past keys and values to remove the padding from previous batches
                    padded_past_values[
                        start_index:end_index, :, -batch.max_input_length :, :
                    ] = t[:, :, -batch.max_input_length :, :]
                    del t

                    start_index = end_index

        return cls(
            batch_id=batches[0].batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=None,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            all_decoder_input_ids=all_decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_last_hidden_state=encoder_last_hidden_state,
            past_key_values=past_key_values,
            input_lengths=input_lengths,
            decoder_input_lengths=decoder_input_lengths,
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length,
            max_decoder_input_length=max_decoder_input_length,
            padding_right_offset=padding_right_offset,
            max_tokens=max_tokens,
        )

    def __len__(self):
        return len(self.requests)


class Seq2SeqLM(Model):
    def __init__(
        self,
        model_id: str,
        model_class,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        default_dtype=torch.float16,
        trust_remote_code: bool = False,
        config_class=AutoConfig,
        tokenizer_class=AutoTokenizer,
        aliases=None,
    ):
        self.quantize = quantize
        self.process_group, rank, world_size = initialize_torch_distributed()
        if torch.cuda.is_available():
            device = torch.device(f"cuda:{rank}")
            dtype = default_dtype if dtype is None else dtype
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            device = torch.device(f"xpu:{rank}")
            dtype = default_dtype if dtype is None else dtype
        elif SYSTEM == "ipex":
            device = torch.device("cpu")
            # Float16 doesn't exist on target.
            dtype = torch.bfloat16 if dtype is None else dtype
        else:
            device = torch.device("cpu")
            dtype = torch.float32 if dtype is None else dtype

        config = config_class.from_pretrained(
            model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
        config.quantize = quantize
        config.speculator = speculator

        tokenizer = tokenizer_class.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        tokenizer.bos_token_id = config.decoder_start_token_id

        weights_loader = get_loader(
            quantize=quantize, model_id=model_id, revision=revision
        )
        torch.distributed.barrier(group=self.process_group)
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(
            filenames,
            device=device,
            dtype=dtype,
            process_group=self.process_group,
            aliases=aliases,
            weights_loader=weights_loader,
        )
        if config.quantize in ["awq", "exl2", "gptq", "marlin"]:
            weights._set_gptq_params(model_id, revision)

        model = model_class(config, weights)

        torch.distributed.barrier(group=self.process_group)
        super().__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=True,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
        )

    @classmethod
    def fallback(
        cls,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        if speculator:
            raise RuntimeError("Speculator decoding is not enabled for AutoModel")

        device_count = 0
        if torch.cuda.is_available():
            device = torch.device("cuda")
            device_count = torch.cuda.device_count()
            dtype = torch.float16 if dtype is None else dtype
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            device = torch.device("xpu")
            device_count = torch.xpu.device_count()
            dtype = torch.float16 if dtype is None else dtype
        else:
            if quantize:
                raise ValueError("quantization is not available on CPU")

            device = torch.device("cpu")
            dtype = torch.float32 if dtype is None else dtype

        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=dtype,
            device_map=("auto" if device_count > 1 else None),
            load_in_8bit=quantize == "bitsandbytes",
            trust_remote_code=trust_remote_code,
        )
        if device_count == 1:
            model = model.to(device)

        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        tokenizer.bos_token_id = model.config.decoder_start_token_id

        self = cls.__new__(
            cls,
        )
        super().__init__(
            self,
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=True,
            dtype=dtype,
            device=device,
        )
        self.quantize = quantize
        return self

    @property
    def batch_type(self) -> Type[Seq2SeqLMBatch]:
        return Seq2SeqLMBatch

    def forward(
        self,
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask: Optional,
        encoder_last_hidden_state: Optional,
        past_key_values: Optional = None,
    ) -> Tuple[
        torch.Tensor,
        Optional[torch.Tensor],
        torch.Tensor,
        List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
    ]:
        # Model Forward
        outputs = self.model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_last_hidden_state,
            past_key_values=past_key_values,
            use_cache=True,
        )
        if isinstance(outputs, tuple):
            # Our custom models
            outputs, speculative_logits = outputs
        else:
            # Generic transformers models
            speculative_logits = None
        return (
            outputs.logits,
            speculative_logits,
            outputs.encoder_last_hidden_state,
            outputs.past_key_values,
        )

    @tracer.start_as_current_span("generate_token")
    def generate_token(
        self, batch: Seq2SeqLMBatch
    ) -> Tuple[List[Generation], Optional[Seq2SeqLMBatch], Tuple[int, int]]:
        start = time.time_ns()
        if batch.decoder_attention_mask is not None:
            # slice to the correct shape
            decoder_attention_mask = batch.decoder_attention_mask[
                :, : -batch.padding_right_offset
            ]
        else:
            decoder_attention_mask = None

        # Wrap `encoder_last_hidden_state` because for some reason, Transformers does a `encoder_last_hidden_state[0]`
        # internally...
        if batch.encoder_last_hidden_state is not None:
            encoder_last_hidden_state = [batch.encoder_last_hidden_state]
        else:
            encoder_last_hidden_state = None

        logits, speculative_logits, encoder_last_hidden_state, past = self.forward(
            batch.input_ids,
            batch.attention_mask,
            batch.decoder_input_ids,
            decoder_attention_mask,
            encoder_last_hidden_state,
            batch.past_key_values,
        )

        # Speculation is not active for seq2seq
        accepted_ids = torch.ones_like(batch.decoder_input_ids)[:, 0]
        batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
            batch.top_n_tokens,
            batch.top_n_tokens_tensor,
            torch.log_softmax(logits[:, -1], -1),
            accepted_ids,
        )

        start_decode = time.time_ns()

        # Finished requests
        generations: List[Generation] = []
        stopped = True

        # Zipped iterator
        iterator = zip(
            batch.requests,
            batch.input_lengths,
            batch.prefix_offsets,
            batch.read_offsets,
            batch.decoder_input_lengths,
            logits,
            batch.next_token_choosers,
            batch.stopping_criterias,
            batch.all_decoder_input_ids,
            batch.top_n_tokens,
            batch_top_token_ids,
            batch_top_token_logprobs,
        )

        # For each member of the batch
        for i, (
            request,
            input_length,
            prefix_offset,
            read_offset,
            decoder_input_length,
            logits,
            next_token_chooser,
            stopping_criteria,
            all_decoder_input_ids,
            top_n_tokens,
            top_token_ids,
            top_token_logprobs,
        ) in enumerate(iterator):
            # Select next token
            next_token_id, logprobs = next_token_chooser(
                all_decoder_input_ids.view(1, -1), logits[-1:, :]
            )

            # Append next token to decoder tokens
            all_decoder_input_ids = torch.cat(
                [all_decoder_input_ids, next_token_id.squeeze(1)]
            )
            new_decoder_input_length = decoder_input_length + 1

            # Generated token
            next_token_logprob = logprobs[-1, next_token_id]
            next_token_id_squeezed = next_token_id.squeeze()
            next_token_text, prefix_offset, read_offset = self.decode_token(
                all_decoder_input_ids, prefix_offset, read_offset
            )

            # Evaluate stopping criteria
            stop, reason = stopping_criteria(next_token_id, next_token_text)

            if not stop:
                stopped = False

            # Shard generations
            # All generations will be appended in the rust sharded client
            if i % self.world_size == self.rank:
                if stop:
                    # Slice with decoder_input_length to remove padding
                    # Decode all tokens
                    output_text, _, _ = self.decode_token(
                        all_decoder_input_ids,
                        prefix_offset=len(all_decoder_input_ids)
                        - decoder_input_length
                        - 1,
                        read_offset=len(all_decoder_input_ids) - decoder_input_length,
                        skip_special_tokens=True,
                    )

                    # Get seed
                    if isinstance(next_token_chooser.choice, Sampling):
                        seed = next_token_chooser.choice.seed
                    else:
                        seed = None

                    generated_text = GeneratedText(
                        output_text, stopping_criteria.current_tokens, reason, seed
                    )
                else:
                    generated_text = None

                # Prefill
                if stopping_criteria.current_tokens == 1 and request.prefill_logprobs:
                    prefill_tokens = Tokens(
                        [self.tokenizer.bos_token_id],
                        [float("nan")],
                        [self.tokenizer.bos_token],
                        [False],
                    )
                else:
                    prefill_tokens = None

                if top_n_tokens > 0:
                    all_top_tokens = []
                    for top_token_ids, top_token_logprobs in zip(
                        top_token_ids, top_token_logprobs
                    ):
                        toptoken_texts = self.tokenizer.batch_decode(
                            top_token_ids,
                            clean_up_tokenization_spaces=False,
                            skip_special_tokens=False,
                        )
                        special_toptokens = [
                            token_id in self.all_special_ids
                            for token_id in top_token_ids
                        ]
                        top_tokens = Tokens(
                            top_token_ids,
                            top_token_logprobs,
                            toptoken_texts,
                            special_toptokens,
                        )
                        all_top_tokens.append(top_tokens)
                    top_tokens = all_top_tokens
                else:
                    top_tokens = None

                generation = Generation(
                    request.id,
                    prefill_tokens,
                    Tokens(
                        [next_token_id_squeezed],
                        [next_token_logprob],
                        [next_token_text],
                        [next_token_id_squeezed.item() in self.all_special_ids],
                    ),
                    generated_text,
                    top_tokens,
                )

                generations.append(generation)

            # Update values
            batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar(
                next_token_id_squeezed.item()
            )
            batch.decoder_input_ids[i] = next_token_id
            batch.all_decoder_input_ids[i] = all_decoder_input_ids
            batch.input_lengths[i] = input_length
            batch.decoder_input_lengths[i] = new_decoder_input_length
            batch.prefix_offsets[i] = prefix_offset
            batch.read_offsets[i] = read_offset
            batch.max_input_length = max(batch.max_input_length, input_length)
            batch.max_decoder_input_length = max(
                batch.max_decoder_input_length, new_decoder_input_length
            )

        # We finished all generations in the batch; there is no next batch
        if stopped:
            forward_ns = start_decode - start
            decode_ns = time.time_ns() - start_decode
            return generations, None, (forward_ns, decode_ns)

        # We don't need input_ids after the prefill forward
        batch.input_ids = None
        batch.encoder_last_hidden_state = encoder_last_hidden_state
        batch.past_key_values = past
        # Update decoder_attention_mask as we added a new token to input_ids
        if batch.decoder_attention_mask is not None:
            batch.decoder_attention_mask[:, -batch.padding_right_offset] = 1
        batch.padding_right_offset -= 1

        forward_ns = start_decode - start
        decode_ns = time.time_ns() - start_decode
        return generations, batch, (forward_ns, decode_ns)


================================================
FILE: server/text_generation_server/models/transformers_flash_causal_lm.py
================================================
import math
from typing import List, Optional

import torch
from opentelemetry import trace
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers.modeling_utils

from text_generation_server.models.flash_causal_lm import FlashCausalLM
from text_generation_server.utils import initialize_torch_distributed

from text_generation_server.layers.attention import paged_attention, attention, Seqlen
from text_generation_server.layers.attention.kv_cache import KVScales, KVCache
from text_generation_server.models.globals import ATTENTION
from text_generation_server.utils.import_utils import SYSTEM

tracer = trace.get_tracer(__name__)


def tgi_flash_attention_forward(
    module,
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor],  # This is a positional arg in Transformers
    kv_cache: List[KVCache],
    kv_head_mapping: torch.Tensor,
    slots: torch.Tensor,
    cu_seqlen_prefill: Optional[torch.Tensor],
    seqlen: Seqlen,
    block_tables: torch.Tensor,
    max_s: int,
    kv_scales: KVScales,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    softcap: Optional[float] = None,
    **kwargs,  # This is needed to "absorb" other args passed by Transformers modeling
):

    kv_cache = kv_cache[module.layer_idx]
    query_states = query_states.transpose(1, 2).squeeze(dim=0)
    key_states = key_states.transpose(1, 2).squeeze(dim=0)
    value_states = value_states.transpose(1, 2).squeeze(dim=0)

    # Take care of updating the cache in-place
    kv_cache.store(key=key_states, value=value_states, slots=slots, kv_scales=kv_scales)

    _, num_heads, head_dim = query_states.shape
    softmax_scale = 1 / math.sqrt(head_dim) if softmax_scale is None else softmax_scale
    sliding_window = -1 if sliding_window is None else sliding_window

    if cu_seqlen_prefill is not None:
        attn_output = attention(
            query=query_states,
            key=key_states,
            value=value_states,
            kv_cache=kv_cache,
            kv_scales=kv_scales,
            seqlen=seqlen,
            block_tables=block_tables,
            softmax_scale=softmax_scale,
            window_size_left=sliding_window,
            softcap=softcap,
        )
    else:
        attn_output = paged_attention(
            query_states,
            kv_cache,
            kv_head_mapping,
            softmax_scale,
            block_tables,
            seqlen,
            max_s,
            kv_scales=kv_scales,
            softcap=softcap,
            window_size_left=sliding_window,
        )

    attn_output = attn_output.view(-1, num_heads * head_dim)

    return attn_output, None


transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS["tgi"] = tgi_flash_attention_forward

# The base TP plan of these models has replicated q/k/v. This means that each process will see the full states,
# hence we should not divide the number of heads by the world size. This is a known waste of VRAM (the cache
# will be fully replicated on each process) and GPU communication (additional all-gather operations), however due
# to internal constraints it was not (yet?) possible to circumvent
REPLICATED_ATTENTION_MODELS = [
    "olmo2",
    "phi3",
]


class TransformersFlashCausalLM(FlashCausalLM):
    def __init__(
        self,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        default_dtype=torch.float16,
        trust_remote_code: bool = False,
        tokenizer_class=AutoTokenizer,
        kv_cache_dtype: Optional[torch.dtype] = None,
    ):
        self.quantize = quantize
        self.process_group, rank, world_size = initialize_torch_distributed()

        if speculator:
            raise RuntimeError("Speculator decoding is not enabled for AutoModel")

        if torch.cuda.is_available():
            device = torch.device(f"cuda:{rank}")
            dtype = default_dtype if dtype is None else dtype
        elif SYSTEM == "ipex":
            if hasattr(torch, "xpu") and torch.xpu.is_available():
                device = torch.device(f"xpu:{rank}")
            else:
                device = torch.device("cpu")
            dtype = default_dtype if dtype is None else dtype
        else:
            raise ValueError(
                "Flash `Transformers` modeling backend is not available on cpu."
            )

        tokenizer = tokenizer_class.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=dtype,
            load_in_8bit=quantize == "bitsandbytes",
            trust_remote_code=trust_remote_code,
            attn_implementation="tgi",
            device_map=device if world_size == 1 else None,
            tp_plan="auto" if world_size > 1 else None,
        )

        torch.distributed.barrier(group=self.process_group)

        if tokenizer.pad_token_id is None:
            if model.config.pad_token_id is not None:
                tokenizer.pad_token_id = model.config.pad_token_id
            elif model.config.eos_token_id is not None and isinstance(
                model.config.eos_token_id, int
            ):
                tokenizer.pad_token_id = model.config.eos_token_id
            elif tokenizer.eos_token_id is not None:
                tokenizer.pad_token_id = tokenizer.eos_token_id
            else:
                tokenizer.add_special_tokens({"pad_token": "[PAD]"})

        self.num_layers = model.config.num_hidden_layers
        self.num_heads = model.config.num_attention_heads
        self.num_kv_heads = model.config.num_key_value_heads
        # Some models use GQA and different sizes for o_proj
        # and q_proj, that allows for that.
        if hasattr(model.config, "head_dim"):
            self.head_size = model.config.head_dim
        else:
            self.head_size = (
                model.config.hidden_size // model.config.num_attention_heads
            )

        # Skip it for models in the exception list
        if model.config.model_type not in REPLICATED_ATTENTION_MODELS:
            self.num_heads = self.num_heads // self.process_group.size()
            self.num_kv_heads = (
                self.num_kv_heads // self.process_group.size()
                if self.num_kv_heads > 1
                else self.num_kv_heads
            )

        self.cuda_graphs = {}
        self.kv_cache = []
        self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype

        if ATTENTION == "flashinfer":
            from text_generation_server.layers.attention.flashinfer import (
                create_prefill_state,
                create_decode_state,
                create_prefill_with_paged_kv_state,
            )

            self.prefill_state = create_prefill_state(device=device)
            self.prefill_with_paged_kv_state = create_prefill_with_paged_kv_state(
                device=device
            )

            self.decode_state = create_decode_state(
                device=device,
                num_heads=self.num_heads,
                num_kv_heads=self.num_kv_heads,
            )

        self.num_groups = self.num_heads // self.num_kv_heads

        # Those will never change and will be used in the forwards
        self.kv_head_mapping = torch.arange(
            0, self.num_kv_heads, dtype=torch.int32, device=device
        ).repeat_interleave(self.num_groups)
        # This means no scale
        self.kv_scales = KVScales(
            torch.tensor(1.0, device=device),
            torch.tensor(1.0, device=device),
        )

        # Skip FlashCausalLM init.
        super(FlashCausalLM, self).__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=False,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
        )

        # Monkey patch of `self.model.forward` to match `FlashCausalLM`. It avoids duplicating a lot of code
        # We first copy the original model.forward because we still need it in the monkey patch
        self.model.original_forward = self.model.forward
        self.model.forward = self._model_forward

        torch.distributed.barrier(group=self.process_group)

    @classmethod
    def fallback(
        cls,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        return cls(
            model_id=model_id,
            revision=revision,
            quantize=quantize,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )

    def _model_forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[KVCache],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        lm_head_indices: Optional[torch.Tensor],
        prefill_cache_indices=None,  # not used, but passed to match original signature
        adapter_data=None,  # not supported, but passed to match original signature
    ):
        # A value of `None` (i.e. no logit slicing) translates to `0` in Transformers
        logits_to_keep = lm_head_indices if lm_head_indices is not None else 0

        # This is equivalent to `self.model.forward`, see the monkey patch in __init__
        logits = self.model.original_forward(
            input_ids=input_ids.unsqueeze(0),  # expand dim to fit Transformers
            position_ids=position_ids.unsqueeze(0),  # expand dim to fit Transformers
            past_key_values=None,  # we use self.kv_cache instead of transformers cache object
            use_cache=False,  # we use self.kv_cache instead of transformers cache object
            logits_to_keep=logits_to_keep,
            return_dict=True,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
            kv_head_mapping=self.kv_head_mapping,
            kv_scales=self.kv_scales,
        ).logits.squeeze(dim=0)

        return logits, None


================================================
FILE: server/text_generation_server/models/transformers_flash_vlm.py
================================================
import math
from typing import List, Optional

import torch
from opentelemetry import trace
from transformers import AutoTokenizer, AutoProcessor
import transformers.modeling_utils

from text_generation_server.models.flash_causal_lm import FlashCausalLM
from text_generation_server.models.vlm_causal_lm import VlmCausalLM, VlmCausalLMBatch
from text_generation_server.utils import initialize_torch_distributed

from text_generation_server.layers.attention import paged_attention, attention, Seqlen
from text_generation_server.layers.attention.kv_cache import KVScales, KVCache
from text_generation_server.models.globals import ATTENTION
import torch.nn.functional as F
from text_generation_server.utils.import_utils import SYSTEM

tracer = trace.get_tracer(__name__)

# The base TP plan of these models has replicated q/k/v. This means that each process will see the full states,
# hence we should not divide the number of heads by the world size. This is a known waste of VRAM (the cache
# will be fully replicated on each process) and GPU communication (additional all-gather operations), however due
# to internal constraints it was not (yet?) possible to circumvent
REPLICATED_ATTENTION_MODELS = [
    "olmo2",
    "phi3",
]


# # Qwen2VL
# transformers.models.qwen2_vl.modeling_qwen2_vl.QWEN2_VL_VISION_ATTENTION_CLASSES[
#     "tgi"
# ] = transformers.models.qwen2_vl.modeling_qwen2_vl.QWEN2_VL_VISION_ATTENTION_CLASSES[
#     "eager"
# ]
def tgi_flash_attention_forward(
    module,
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor],  # This is a positional arg in Transformers
    kv_cache: List[KVCache],
    kv_head_mapping: torch.Tensor,
    slots: torch.Tensor,
    cu_seqlen_prefill: Optional[torch.Tensor],
    seqlen: Seqlen,
    block_tables: torch.Tensor,
    max_s: int,
    kv_scales: KVScales,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    softcap: Optional[float] = None,
    use_sdpa: Optional[bool] = False,
    **kwargs,  # This is needed to "absorb" other args passed by Transformers modeling
):
    kv_cache = kv_cache[module.layer_idx]
    query_states = query_states.transpose(1, 2).squeeze(dim=0)
    key_states = key_states.transpose(1, 2).squeeze(dim=0)
    value_states = value_states.transpose(1, 2).squeeze(dim=0)

    # Take care of updating the cache in-place
    kv_cache.store(key=key_states, value=value_states, slots=slots, kv_scales=kv_scales)

    _, num_heads, head_dim = query_states.shape
    softmax_scale = 1 / math.sqrt(head_dim) if softmax_scale is None else softmax_scale
    sliding_window = -1 if sliding_window is None else sliding_window

    if cu_seqlen_prefill is not None:
        if not use_sdpa:
            attn_output = attention(
                query=query_states,
                key=key_states,
                value=value_states,
                kv_cache=kv_cache,
                kv_scales=kv_scales,
                seqlen=seqlen,
                block_tables=block_tables,
                softmax_scale=softmax_scale,
                window_size_left=sliding_window,
                softcap=softcap,
            )
        else:
            lengths = cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]
            max_length = max(lengths)
            attention_mask = attention_mask[:, :, :, :max_length]
            enable_gqa = query_states.shape[1] != key_states.shape[1]
            # Split tensors using vectorized split
            query_list = torch.split(query_states, lengths.tolist(), dim=0)
            key_list = torch.split(key_states, lengths.tolist(), dim=0)
            value_list = torch.split(value_states, lengths.tolist(), dim=0)

            padded_query = torch.nn.utils.rnn.pad_sequence(query_list, batch_first=True)
            padded_key = torch.nn.utils.rnn.pad_sequence(key_list, batch_first=True)
            padded_value = torch.nn.utils.rnn.pad_sequence(value_list, batch_first=True)

            padded_query = padded_query.transpose(1, 2).contiguous()
            padded_key = padded_key.transpose(1, 2).contiguous()
            padded_value = padded_value.transpose(1, 2).contiguous()

            # Compute attention
            attn_output = F.scaled_dot_product_attention(
                padded_query,
                padded_key,
                padded_value,
                attn_mask=attention_mask,
                scale=softmax_scale,
                enable_gqa=enable_gqa,
            )

            attn_output = attn_output.transpose(
                1, 2
            )  # [batch_size, seq_len, num_heads, head_dim]
            max_seq_len = padded_query.size(2)
            seq_range = torch.arange(max_seq_len, device=padded_query.device).unsqueeze(
                0
            )
            lengths_tensor = torch.tensor(
                lengths, device=padded_query.device
            ).unsqueeze(1)
            mask = seq_range < lengths_tensor  # [batch, max_seq_len]
            attn_output = attn_output[mask]  # [total_seq_len, num_heads, head_dim]

    else:
        attn_output = paged_attention(
            query_states,
            kv_cache,
            kv_head_mapping,
            softmax_scale,
            block_tables,
            seqlen,
            max_s,
            kv_scales=kv_scales,
            softcap=softcap,
            window_size_left=sliding_window,
        )

    attn_output = attn_output.view(-1, num_heads * head_dim)

    return attn_output, None


transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS["tgi"] = tgi_flash_attention_forward


# TODO: implement
# tgi_cross_attention_forward


class TransformersFlashVlmCausalLM(VlmCausalLM):
    def __init__(
        self,
        model_id: str,
        model_class,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        default_dtype=torch.float16,
        trust_remote_code: bool = False,
        tokenizer_class=AutoTokenizer,
        processor_class=AutoProcessor,
        processor_kwargs=None,
        kv_cache_dtype: Optional[torch.dtype] = None,
        batch_class=VlmCausalLMBatch,
        support_chunking: bool = True,
    ):
        self.batch_class = batch_class
        self.quantize = quantize
        self.process_group, rank, world_size = initialize_torch_distributed()
        self.dtype = dtype

        if speculator:
            raise RuntimeError("Speculator decoding is not enabled for AutoModel")

        if torch.cuda.is_available():
            device = torch.device(f"cuda:{rank}")
            dtype = default_dtype if dtype is None else dtype
        elif SYSTEM == "ipex":
            if hasattr(torch, "xpu") and torch.xpu.is_available():
                device = torch.device(f"xpu:{rank}")
            else:
                device = torch.device("cpu")
            dtype = default_dtype if dtype is None else dtype
        else:
            raise ValueError(
                "Flash `Transformers` modeling backend is not available on cpu."
            )

        tokenizer = tokenizer_class.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )

        if processor_kwargs is None:
            processor_kwargs = {}

        self.processor = processor_class.from_pretrained(
            model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **processor_kwargs,
        )

        attn_implementation = {
            "text_config": "tgi",
            "vision_config": "sdpa",
        }

        model = model_class.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=dtype,
            load_in_8bit=quantize == "bitsandbytes",
            trust_remote_code=trust_remote_code,
            attn_implementation=attn_implementation,
            device_map=device if world_size == 1 else None,
            tp_plan="auto" if world_size > 1 else None,
        )

        torch.distributed.barrier(group=self.process_group)
        self.config = model.config
        config = model.config

        # VLM models define the config we care about in their text_config
        text_config = getattr(model.config, "text_config", None)
        if text_config is not None:
            config = text_config

        if tokenizer.pad_token_id is None:
            if model.config.pad_token_id is not None:
                tokenizer.pad_token_id = model.config.pad_token_id
            elif model.config.eos_token_id is not None and isinstance(
                model.config.eos_token_id, int
            ):
                tokenizer.pad_token_id = model.config.eos_token_id
            elif tokenizer.eos_token_id is not None:
                tokenizer.pad_token_id = tokenizer.eos_token_id
            else:
                tokenizer.add_special_tokens({"pad_token": "[PAD]"})

        self.num_layers = config.num_hidden_layers
        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_key_value_heads
        # Some models use GQA and different sizes for o_proj
        # and q_proj, that allows for that.
        if hasattr(config, "head_dim"):
            self.head_size = config.head_dim
        else:
            self.head_size = config.hidden_size // config.num_attention_heads

        # Skip it for models in the exception list
        if config.model_type not in REPLICATED_ATTENTION_MODELS:
            self.num_heads = self.num_heads // self.process_group.size()
            self.num_kv_heads = (
                self.num_kv_heads // self.process_group.size()
                if self.num_kv_heads > 1
                else self.num_kv_heads
            )

        self.cuda_graphs = {}
        self.kv_cache = []
        self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype

        if ATTENTION == "flashinfer":
            from text_generation_server.layers.attention.flashinfer import (
                create_prefill_state,
                create_decode_state,
                create_prefill_with_paged_kv_state,
            )

            self.prefill_state = create_prefill_state(device=device)
            self.prefill_with_paged_kv_state = create_prefill_with_paged_kv_state(
                device=device
            )

            self.decode_state = create_decode_state(
                device=device,
                num_heads=self.num_heads,
                num_kv_heads=self.num_kv_heads,
            )

        self.num_groups = self.num_heads // self.num_kv_heads

        # Those will never change and will be used in the forwards
        self.kv_head_mapping = torch.arange(
            0, self.num_kv_heads, dtype=torch.int32, device=device
        ).repeat_interleave(self.num_groups)
        # This means no scale
        self.kv_scales = KVScales(
            torch.tensor(1.0, device=device),
            torch.tensor(1.0, device=device),
        )

        # Skip FlashCausalLM init.
        super(FlashCausalLM, self).__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            requires_padding=False,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
            support_chunking=support_chunking,
        )

        # Monkey patch of `self.model.forward` to match `FlashCausalLM`. It avoids duplicating a lot of code
        # We first copy the original model.forward because we still need it in the monkey patch
        self.model.original_forward = self.model.forward
        self.model.forward = self._model_forward
        self.model.get_position_ids = self.get_position_ids

        torch.distributed.barrier(group=self.process_group)

    def get_position_ids(self, input_ids, image_grid_thw, position_ids):
        return position_ids

    def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
        return {
            "input_ids": input_ids.unsqueeze(0),
            "position_ids": position_ids.unsqueeze(0),
        }

    def post_process_outputs(self, logits, lm_head_indices):
        return logits.squeeze(dim=0)

    @classmethod
    def fallback(
        cls,
        model_id: str,
        model_class,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
        batch_class: Optional[type] = VlmCausalLMBatch,
        processor_kwargs: Optional[dict] = None,
        support_chunking: bool = True,
    ):
        return cls(
            model_id=model_id,
            model_class=model_class,
            revision=revision,
            quantize=quantize,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
            batch_class=batch_class,
            processor_kwargs=processor_kwargs,
            support_chunking=support_chunking,
        )

    def _model_forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[KVCache],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        seqlen: Seqlen,
        max_s: int,
        lm_head_indices: Optional[torch.Tensor],
        prefill_cache_indices=None,  # not used, but passed to match original signature
        adapter_data=None,  # not supported, but passed to match original signature
        pixel_values: torch.FloatTensor = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        pixel_attention_mask=None,
        image_sizes: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ):
        # A value of `None` (i.e. no logit slicing) translates to `0` in Transformers
        logits_to_keep = lm_head_indices if lm_head_indices is not None else 0

        inputs = self.pre_process_inputs(
            input_ids=input_ids,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
        )
        inputs["input_ids"] = None

        # This is equivalent to `self.model.forward`, see the monkey patch in __init__
        logits = self.model.original_forward(
            input_ids=inputs["input_ids"],
            inputs_embeds=inputs_embeds.unsqueeze(0),
            position_ids=inputs["position_ids"],
            past_key_values=None,  # we use self.kv_cache instead of transformers cache object
            use_cache=False,  # we use self.kv_cache instead of transformers cache object
            logits_to_keep=logits_to_keep,
            return_dict=True,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            seqlen=seqlen,
            max_s=max_s,
            kv_head_mapping=self.kv_head_mapping,
            kv_scales=self.kv_scales,
            pixel_values=pixel_values,
            pixel_attention_mask=pixel_attention_mask,
            image_sizes=image_sizes,
            image_grid_thw=image_grid_thw,
            attention_mask=inputs.get("attention_mask", None),
            use_sdpa=inputs.get("use_sdpa", False),
            cache_position=inputs.get("cache_position", None),
        ).logits

        logits = self.post_process_outputs(logits, lm_head_indices)

        return logits, None


class TransformersQwen2VlmCausalLM(TransformersFlashVlmCausalLM):
    def get_position_ids(self, input_ids: torch.Tensor, image_grid_thw: torch.Tensor):
        if image_grid_thw is None:
            return (
                torch.arange(input_ids.shape[0], device=input_ids.device)
                .unsqueeze(1)
                .repeat(1, 3)
            )

        spatial_merge_size = self.config.vision_config.spatial_merge_size
        vision_start_token_id = self.config.vision_start_token_id
        vision_end_token_id = self.config.vision_end_token_id
        device = input_ids.device
        dtype = input_ids.dtype
        input_ids_len = input_ids.shape[0]

        vision_starts = torch.where(input_ids == vision_start_token_id)[0]
        vision_ends = torch.where(input_ids == vision_end_token_id)[0]
        vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
        prev_vision_end = torch.cat(
            [torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
        )
        text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
        vision_widths_max = torch.cat(
            [
                torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
                image_grid_thw[:-1, 2] // spatial_merge_size,
            ]
        )
        vision_segment_lengths = vision_widths_max + text_lengths_between_vision
        vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
        text_segment_lengths = vision_segment_lengths - text_lengths_between_vision

        # create position ids for each vision segment based on the image grid
        llm_pos_ids_list = []
        for i, _ in enumerate(vision_segments):
            t, h, w = (
                image_grid_thw[i][0],
                image_grid_thw[i][1] // spatial_merge_size,
                image_grid_thw[i][2] // spatial_merge_size,
            )
            t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
            h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
            w_indices = torch.arange(w, device=device).repeat(t * h)
            image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)

            # offset by the position of the last vision segment
            im = image_position_ids + vision_segment_lengths[i]
            llm_pos_ids_list.append(im)

        # create position ids for each text segment
        text_ranges = [
            torch.arange(seq_len, device=device).view(1, -1).expand(3, -1)
            + text_segment_lengths[i]
            for i, seq_len in enumerate(text_lengths_between_vision)
        ]

        full_llm_pos_ids_list = [
            item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
        ]
        # import ipdb

        # ipdb.set_trace()
        max_s = full_llm_pos_ids_list[-1].max() + 1
        final_text_len = input_ids_len - vision_ends[-1]
        if final_text_len > 0:
            m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
            full_llm_pos_ids_list.append(m + max_s)

        position_ids = (
            torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
        )
        return position_ids

    def post_process_outputs(self, logits, lm_head_indices):
        return logits.squeeze(dim=0)[lm_head_indices].unsqueeze(0)

    def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
        input_ids = input_ids.unsqueeze(0)
        position_ids = position_ids.transpose(0, 1).unsqueeze(1)
        return {"input_ids": input_ids, "position_ids": position_ids}


class TransformersGemma3VlmCausalLM(TransformersFlashVlmCausalLM):
    def get_attention_mask(self, input_ids, cu_seqlen_prefill):
        device = input_ids.device
        dtype = self.dtype
        min_dtype = torch.finfo(dtype).min

        lengths = (cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]).tolist()
        batch_size = len(lengths)

        sequence_length = max(lengths)
        target_length = sequence_length
        # Create the padding mask from the computed lengths.
        # pad_mask: [batch, sequence_length] where True indicates valid tokens.
        seq_range = torch.arange(sequence_length, device=device).unsqueeze(0)
        lengths_tensor = torch.tensor(lengths, device=device).unsqueeze(1)
        pad_mask = seq_range < lengths_tensor  # shape: [batch, sequence_length]

        # Build the base causal mask (for non-image tokens):
        causal_mask = torch.tril(
            torch.ones(
                (sequence_length, sequence_length), dtype=torch.bool, device=device
            )
        )
        base_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze(
            1
        )  # [batch, sequence_length, sequence_length]
        base_mask = base_mask & causal_mask.unsqueeze(0)  # apply causal constraint

        image_token_mask = (input_ids == self.config.image_token_index).to(
            input_ids.device
        )

        image_token_mask = torch.nn.utils.rnn.pad_sequence(
            torch.split(image_token_mask, lengths), batch_first=True, padding_value=0
        )
        bidirectional_mask = image_token_mask.unsqueeze(2) & image_token_mask.unsqueeze(
            1
        )

        # Combine the causal base mask and the bidirectional mask.
        combined_mask = torch.logical_or(
            base_mask.unsqueeze(1), bidirectional_mask.unsqueeze(1)
        ).to(device)
        # combined_mask now has shape [batch, 1, sequence_length, sequence_length]

        full_attention_mask = torch.zeros(
            (batch_size, 1, sequence_length, target_length),
            device=device,
            dtype=torch.bool,
        )
        full_attention_mask[:, :, :, :sequence_length] = combined_mask

        final_attention_mask = torch.where(full_attention_mask, 0, min_dtype).to(device)

        return final_attention_mask

    def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
        inputs = {
            "input_ids": input_ids.unsqueeze(0),
            "position_ids": position_ids.unsqueeze(0),
        }

        if cu_seqlen_prefill is not None:
            attention_mask = self.get_attention_mask(
                input_ids.squeeze(0), cu_seqlen_prefill
            )
            inputs["attention_mask"] = attention_mask
            inputs["use_sdpa"] = True

        return inputs


class TransformersLlama4VlmCausalLM(TransformersFlashVlmCausalLM):
    def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
        inputs = super().pre_process_inputs(input_ids, position_ids, cu_seqlen_prefill)
        inputs["cache_position"] = position_ids
        inputs["attention_mask"] = torch.zeros((1, 1, 1, 1), device=input_ids.device)
        return inputs

    def get_vision_embeds(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ):
        image_features = self.model.get_image_features(
            pixel_values=pixel_values,
            vision_feature_layer=self.model.config.vision_config.vision_feature_layer,
            vision_feature_select_strategy=self.model.config.vision_config.vision_feature_select_strategy,
            image_sizes=image_sizes,
        )

        vision_flat = image_features.view(-1, image_features.size(-1))
        projected_vision_flat = self.model.multi_modal_projector(vision_flat)
        return projected_vision_flat

    def get_inputs_embeds(self, input_ids, vision_embeds=None):
        inputs_embeds = self.model.get_input_embeddings()(input_ids)

        if vision_embeds is not None:
            original_inputs_embeds_shape = inputs_embeds.shape
            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(
                -1
            )
            final_mask = special_image_mask.to(inputs_embeds.device)
            inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1))

            final_mask_1d = final_mask[..., 0].reshape(-1)
            num_tokens_to_fill = final_mask_1d.sum()

            if num_tokens_to_fill != vision_embeds.size(0):
                raise ValueError(
                    f"Mismatch: final_mask wants {num_tokens_to_fill} embeddings, "
                    f"but multi_modal_projector returned {vision_embeds.size(0)}"
                )

            expanded_mask = final_mask_1d.unsqueeze(-1).expand(
                -1, inputs_embeds.size(-1)
            )
            inputs_embeds = inputs_embeds.masked_scatter(expanded_mask, vision_embeds)
            inputs_embeds = inputs_embeds.view(original_inputs_embeds_shape)
        return inputs_embeds


================================================
FILE: server/text_generation_server/models/types.py
================================================
import torch

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional

from transformers import PreTrainedTokenizerBase

from text_generation_server.pb import generate_pb2
from text_generation_server.pb.generate_pb2 import FinishReason


class Batch(ABC):
    @abstractmethod
    def to_pb(self) -> generate_pb2.CachedBatch:
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "Batch":
        raise NotImplementedError

    @abstractmethod
    def filter(self, request_ids: List[int]) -> "Batch":
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def concatenate(cls, batches: List["Batch"]) -> "Batch":
        raise NotImplementedError

    @abstractmethod
    def __len__(self):
        raise NotImplementedError


@dataclass
class GeneratedText:
    text: str
    generated_tokens: int
    finish_reason: FinishReason
    seed: Optional[int]

    def to_pb(self) -> generate_pb2.GeneratedText:
        return generate_pb2.GeneratedText(
            text=self.text,
            generated_tokens=self.generated_tokens,
            finish_reason=self.finish_reason,
            seed=self.seed,
        )


@dataclass
class Tokens:
    token_ids: List[int]
    logprobs: List[float]
    texts: List[str]
    is_special: List[bool]

    def to_pb(self) -> generate_pb2.Tokens:
        return generate_pb2.Tokens(
            ids=self.token_ids,
            logprobs=self.logprobs,
            texts=self.texts,
            is_special=self.is_special,
        )

    def __len__(self):
        return len(self.token_ids)

    def __add__(self, other: "Tokens") -> "Tokens":
        return Tokens(
            self.token_ids + other.token_ids,
            self.logprobs + other.logprobs,
            self.texts + other.texts,
            self.is_special + other.is_special,
        )


@dataclass
class Generation:
    request_id: int
    prefill_tokens: Optional[Tokens]
    tokens: Tokens
    generated_text: Optional[GeneratedText]
    # Optional for now, since it's not yet supported for every model.
    top_tokens: Optional[List[Tokens]]

    def to_pb(self) -> generate_pb2.Generation:
        return generate_pb2.Generation(
            request_id=self.request_id,
            prefill_tokens=(
                self.prefill_tokens.to_pb() if self.prefill_tokens is not None else None
            ),
            tokens=self.tokens.to_pb(),
            generated_text=(
                self.generated_text.to_pb() if self.generated_text is not None else None
            ),
            top_tokens=(
                [top_tokens.to_pb() for top_tokens in self.top_tokens]
                if self.top_tokens is not None
                else None
            ),
        )


================================================
FILE: server/text_generation_server/models/vlm_causal_lm.py
================================================
from dataclasses import dataclass
import torch
from PIL import Image
from io import BytesIO

from opentelemetry import trace
from typing import Iterable, Optional, Tuple, List, Type, Dict

from transformers import PreTrainedTokenizerBase
from transformers.image_processing_utils import select_best_resolution
from text_generation_server.pb import generate_pb2
from text_generation_server.models.flash_causal_lm import (
    FlashCausalLMBatch,
    FlashCausalLM,
)
from text_generation_server.models.globals import PREFIX_CACHING, ATTENTION, MEM_POOL
from loguru import logger
from text_generation_server.utils.log import log_master
from transformers import AutoProcessor
from text_generation_server.layers.attention import Seqlen
from text_generation_server.models.metadata_kernels import block_tables_to_ragged

tracer = trace.get_tracer(__name__)

IDEFICS2_FAKE_TOKEN = "<fake_token_around_image>"
IDEFICS2_IMAGE_TOKEN = "<image>"

IDEFICS3_IMAGE_TOKEN = "<image>"
IDEFICS3_FAKE_IMAGE_TOKEN = "<fake_token_around_image>"
IDEFICS3_GLOBAL_IMG_TOKEN = "<global-img>"


def prompt_split_image_llama4(aspect_ratio, num_patches_per_chunk):
    """
    Create a structured string representation of image tokens

    Args:
       num_patches: Number of patches in the image

    Returns:
        String with appropriate image tokens
    """
    img_string = "<|image_start|>"
    ratio_h, ratio_w = aspect_ratio
    if ratio_h * ratio_w > 1:
        for yy in range(ratio_h):
            for xx in range(ratio_w):
                img_string += "<|patch|>" * num_patches_per_chunk
                if xx < ratio_w - 1:
                    img_string += "<|tile_x_separator|>"

            img_string += "<|tile_y_separator|>"
    img_string += "<|image|>"
    img_string += "<|patch|>" * num_patches_per_chunk
    img_string += "<|image_end|>"

    return img_string


# copied from: https://github.com/huggingface/transformers/blob/02ed609285c2448b3b54c31e362f2c389fa952ab/src/transformers/models/idefics3/processing_idefics3.py#L44-L60
def _prompt_split_image(
    *,
    image_seq_len: int,
    image_rows: int,
    image_cols: int,
    fake_token_around_image: str,
    image_token: str,
    global_img_token: str,
):
    """Prompt with expanded image tokens for when the image is split into patches."""
    text_split_images = ""
    for n_h in range(image_rows):
        for n_w in range(image_cols):
            text_split_images += (
                f"{fake_token_around_image}"
                + f"<row_{n_h + 1}_col_{n_w + 1}>"
                + f"{image_token}" * image_seq_len
            )
        text_split_images += "\n"

    text_split_images += (
        f"\n{fake_token_around_image}"
        + f"{global_img_token}"
        + f"{image_token}" * image_seq_len
        + f"{fake_token_around_image}"
    )
    return text_split_images


def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (`tuple`):
            The size of the input image in the format (height, width).
        grid_pinpoints (`List`):
            A list containing possible resolutions. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        patch_size (`int`):
            The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if not isinstance(grid_pinpoints, list):
        raise ValueError("grid_pinpoints should be a list of tuples or lists")

    height, width = select_best_resolution(image_size, grid_pinpoints)
    return height // patch_size, width // patch_size


def image_text_replacement(processor, image_input, config) -> str:
    if config.model_type == "idefics2":
        image_seq_len = 64
        image_str = f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_IMAGE_TOKEN * image_seq_len}{IDEFICS2_FAKE_TOKEN}"
        if processor.image_processor.do_image_splitting:
            image_str *= 5
        return image_str, IDEFICS2_FAKE_TOKEN
    if config.model_type == "idefics3":
        # TODO: implement this in a more general way
        n_rows = image_input["rows"][0][0]
        n_cols = image_input["cols"][0][0]
        image_seq_len = int(
            ((config.vision_config.image_size // config.vision_config.patch_size) ** 2)
            / (config.scale_factor**2)
        )
        image_str = _prompt_split_image(
            image_seq_len=image_seq_len,
            image_rows=n_rows,
            image_cols=n_cols,
            fake_token_around_image=IDEFICS3_FAKE_IMAGE_TOKEN,
            image_token=IDEFICS3_IMAGE_TOKEN,
            global_img_token=IDEFICS3_GLOBAL_IMG_TOKEN,
        )
        return image_str, IDEFICS3_FAKE_IMAGE_TOKEN
    elif config.model_type == "llava_next":
        height, width = image_input["image_sizes"][0]
        num_features = get_number_of_features(height, width, config)

        log_master(
            logger.info,
            f"Found {num_features} features in image of resolution {height}x{width}",
        )
        return "<image>" * num_features, "<image>"

    elif config.model_type == "paligemma":
        return "<image>" * config.text_config.num_image_tokens, "<image>"
    elif config.model_type == "qwen2_vl":
        grid_t, grid_h, grid_w = image_input["image_grid_thw"][0]
        num_pads = grid_t * grid_h * grid_w // 4
        padding = "<|image_pad|>" * num_pads
        return f"<|vision_start|>{padding}<|vision_end|>", "<|vision_start|>"
    elif config.model_type == "qwen2_5_vl":
        grid_t, grid_h, grid_w = image_input["image_grid_thw"][0]
        num_pads = grid_t * grid_h * grid_w // 4
        padding = "<|image_pad|>" * num_pads
        return f"<|vision_start|>{padding}<|vision_end|>", "<|vision_start|>"
    elif config.model_type == "gemma3":
        # TODO: get correct number of features via reviewing the Gemma3 architecture
        # and calculating the number of image tokens
        num_pads = 256
        padding = "<image_soft_token>" * num_pads
        return f"\n\n<start_of_image>{padding}<end_of_image>\n\n", "<start_of_image>"
    elif config.model_type == "llama4":
        patch_size = config.vision_config.patch_size
        pixel_shuffle_ratio = config.vision_config.pixel_shuffle_ratio
        downsample_ratio = int(round(1.0 / (pixel_shuffle_ratio**2)))
        aspect_ratios = image_input["aspect_ratios"][0]
        image_height, image_width = image_input["pixel_values"][0].shape[-2:]

        num_patches_per_chunk = int(
            (image_height // patch_size)
            * (image_width // patch_size)
            // downsample_ratio
        )
        tokens_for_this_image = prompt_split_image_llama4(
            aspect_ratios, num_patches_per_chunk
        )

        return tokens_for_this_image, "<|image_start|>"
    else:
        raise RuntimeError(f"Unknown config {config.model_type} for multimodal")


def image_text_replacement_fixup(config, text: str) -> str:
    if config.model_type == "idefics2":
        return text.replace(
            f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_FAKE_TOKEN}", IDEFICS2_FAKE_TOKEN
        )
    return text


def preprocess_text(config, text: str) -> str:
    if config.model_type == "paligemma":
        return "<bos>" + text + "\n"
    return text


def preprocess_image(config, img):
    model_type = config.model_type

    if model_type in {"qwen2_vl", "qwen2_5_vl"} and img.width <= 20:
        img = img.resize((img.width * 2, img.height * 2))
    if model_type == "paligemma":
        img = img.convert("RGB")

    if model_type not in {"llava_next", "gemma3", "llama4"}:
        # TODO: check if this is needed
        img = [img]

    return img


def get_unpadded_features(
    original_height: int,
    original_width: int,
    npatches: int,
    num_patch_height: int,
    num_patch_width: int,
) -> Tuple[int, int]:
    current_height = npatches * num_patch_height
    current_width = npatches * num_patch_width

    aspect_ratio: float = original_width / original_height
    current_aspect_ratio: float = current_width / current_height

    if aspect_ratio > current_aspect_ratio:
        new_height = (original_height * current_width) // original_width
        padding = (current_height - new_height) // 2
        current_height = current_height - (2 * padding)
    else:
        new_width = (original_width * current_height) // original_height
        padding = (current_width - new_width) // 2
        current_width = current_width - (2 * padding)

    unpadded_features = current_height * current_width
    newline_features = current_height
    return (unpadded_features, newline_features)


def get_number_of_features(height: int, width: int, config) -> int:
    # From config
    # Hardcoded for CLIP for now
    # image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
    image_grid_pinpoints = config.image_grid_pinpoints
    image_size = config.vision_config.image_size
    patch_size = config.vision_config.patch_size

    assert image_size % patch_size == 0

    npatches = image_size // patch_size

    # Dimensions are intentionally swapped to be bug-compatible with
    # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
    num_patch_width, num_patch_height = get_anyres_image_grid_shape(
        [height, width],
        image_grid_pinpoints,
        image_size,
    )
    unpadded_features, newline_features = get_unpadded_features(
        height, width, npatches, num_patch_height, num_patch_width
    )
    # The base patch covers the entire image
    base_features = npatches**2
    return unpadded_features + newline_features + base_features


def scatter_image_embeds(
    embeds: torch.Tensor, is_embed: Optional[torch.Tensor]
) -> torch.Tensor:
    if is_embed is None:
        return embeds

    placeholders = embeds.new_full(
        (is_embed.shape[0], embeds.shape[-1]),
        fill_value=torch.nan,
    )
    placeholders[is_embed] = embeds
    return placeholders


def gather_image_embeds(
    embeds: torch.Tensor, is_embed: Optional[torch.Tensor]
) -> Optional[torch.Tensor]:
    if is_embed is None:
        return embeds
    sel = embeds[is_embed]
    return sel if sel.numel() else None


@dataclass
class ImagePositions:
    offset: int
    length: int
    id: int
    num_placeholder_tokens: int
    is_embed: Optional[torch.Tensor] = None


class VlmCausalLMBatch(FlashCausalLMBatch):
    image_inputs: Optional[List[List[Dict[str, torch.Tensor]]]]
    image_positions: Optional[List[List[ImagePositions]]]
    encoder_cache: Optional[List[Dict[int, torch.Tensor]]]
    pixel_values: Optional[List[torch.Tensor]]
    pixel_attention_mask: Optional[List[torch.Tensor]]
    image_sizes: Optional[List[Tuple[int, int]]]
    image_grid_thw: Optional[torch.Tensor]
    cache_entries_to_free: List[Tuple[int, int]]
    has_image_inputs: bool = False
    inputs_embeds: Optional[torch.Tensor] = None

    @classmethod
    @tracer.start_as_current_span("concatenate")
    def concatenate(cls, batches):
        batch = super(VlmCausalLMBatch, cls).concatenate(batches)

        batch.image_inputs = []
        batch.image_positions = []
        batch.encoder_cache = []
        for b in batches:
            if b.image_inputs is not None:
                batch.image_inputs.extend(b.image_inputs)
            else:
                batch.image_inputs.append(None)
            if b.image_positions is not None:
                batch.image_positions.extend(b.image_positions)
            else:
                batch.image_positions.append(None)
            if b.encoder_cache is not None:
                batch.encoder_cache.extend(b.encoder_cache)
            else:
                batch.encoder_cache.append(None)

        batch.pixel_values = None
        batch.pixel_attention_mask = None
        batch.image_sizes = None
        batch.image_grid_thw = None
        batch.inputs_embeds = None

        # To be filled in prepare_for_prefill
        batch.has_image_inputs = False
        batch.cache_entries_to_free = []

        return batch

    @tracer.start_as_current_span("filter")
    def filter(self, request_ids: List[int]):
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")

        image_inputs = []
        image_positions = []
        encoder_cache = []

        for request_id in request_ids:
            idx = self.requests_idx_mapping[request_id]
            image_inputs.append(self.image_inputs[idx])
            image_positions.append(self.image_positions[idx])
            encoder_cache.append(self.encoder_cache[idx])

        batch = super().filter(request_ids)
        batch.pixel_values = None
        batch.pixel_attention_mask = None
        batch.image_sizes = None
        batch.image_grid_thw = None
        batch.inputs_embeds = None

        batch.image_inputs = image_inputs
        batch.image_positions = image_positions
        batch.encoder_cache = encoder_cache

        # To be filled in prepare_for_prefill
        batch.has_image_inputs = False
        batch.cache_entries_to_free = []
        return batch

    @classmethod
    def batch_tokenized_inputs(
        cls, requests: Iterable[generate_pb2.Request], tokenizer, processor, config
    ):
        kwargs = {}
        if (
            hasattr(processor, "image_processor_class")
            and processor.image_processor_class == "Idefics3ImageProcessor"
        ):
            kwargs["return_row_col_info"] = True

        max_length = 0
        vocab = tokenizer.get_vocab()

        if not hasattr(config, "image_token_index"):
            config.image_token_index = config.image_token_id

        batch_tokenized_inputs: List[List[int]] = []
        batch_image_inputs: List[Optional[List[dict]]] = []
        batch_image_positions: List[Optional[List[ImagePositions]]] = []

        for r in requests:
            text_parts = []
            image_inputs = []
            image_texts = []

            image_id = 0

            for chunk in r.input_chunks.chunks:
                chunk_type = chunk.WhichOneof("chunk")
                if chunk_type == "text":
                    text = preprocess_text(config, chunk.text)
                    text_parts.append(text)
                elif chunk_type == "image":
                    img = Image.open(BytesIO(chunk.image.data))
                    img = preprocess_image(config, img)

                    image_input = processor.image_processor(
                        [img], return_tensors="pt", **kwargs
                    )
                    image_inputs.append(image_input)

                    img_text, img_start_token_str = image_text_replacement(
                        processor, image_input, config
                    )
                    text_parts.append(img_text)

                    image_texts.append([image_id, img_start_token_str, img_text])
                    image_id += 1
                else:
                    raise RuntimeError(f"Invalid chunk type {chunk_type}")

            full_text = image_text_replacement_fixup(config, "".join(text_parts))
            input_ids = tokenizer(
                full_text,
                truncation=True,
                max_length=r.truncate,
                add_special_tokens=(
                    r.add_special_tokens if config.model_type != "paligemma" else False
                ),
            )["input_ids"]
            max_length = max(max_length, len(input_ids))

            if len(image_inputs) > 0:
                img_start_token = vocab[image_texts[0][1]]
                image_positions = cls.get_image_positions(
                    input_ids, image_texts, img_start_token, config, tokenizer
                )
            else:
                image_inputs = None
                image_positions = None

            batch_tokenized_inputs.append(input_ids)
            batch_image_inputs.append(image_inputs)
            batch_image_positions.append(image_positions)

        return batch_tokenized_inputs, batch_image_inputs, batch_image_positions

    @classmethod
    def get_image_positions(
        cls,
        input_ids: List[int],
        image_texts: List[Tuple[int, str, str]],
        img_start_token: int,
        config,
        tokenizer: PreTrainedTokenizerBase,
    ) -> List[ImagePositions]:
        image_positions = []
        num_images = len(image_texts)

        input_ids_t = torch.as_tensor(input_ids)
        img_start_token_pos = torch.where(input_ids_t.eq(img_start_token))[0]
        num_tokens = input_ids_t.numel()

        last_pos = 0
        for i in range(num_images):
            image_id, img_start_token_str, img_text = image_texts[i]
            img_text = image_text_replacement_fixup(config, img_text)

            if config.model_type == "gemma3":
                img_text = img_text.replace("\n\n", "")

            tokens = tokenizer(img_text, add_special_tokens=False, return_tensors="pt")[
                "input_ids"
            ][0]
            length = tokens.numel()

            assert (
                length <= num_tokens
            ), f"{length} > {num_tokens} Image is truncated, try increasing --max-batch-prefill-tokens"

            pos = torch.searchsorted(img_start_token_pos, last_pos, right=False)
            index = img_start_token_pos[pos]
            assert torch.equal(
                input_ids_t[index : index + length], tokens
            ), "Image tokens not found in input_ids"

            is_embed = tokens == config.image_token_index
            num_placeholder_tokens = int(is_embed.sum())
            if num_placeholder_tokens == length:
                is_embed = None

            pos = ImagePositions(
                offset=index,
                length=length,
                id=image_id,
                num_placeholder_tokens=num_placeholder_tokens,
                is_embed=is_embed,
            )

            image_positions.append(pos)
            last_pos = index + length

            if (
                config.model_type == "idefics2"
                and i + 1 != num_images
                and input_ids[last_pos] == config.image_token_index
            ):
                fake_token = last_pos - 1
                fake_token_index = torch.searchsorted(
                    img_start_token_pos, fake_token, right=False
                )
                img_start_token_pos[fake_token_index] = last_pos
                image_texts[i + 1][2] = image_texts[i + 1][2][
                    len(img_start_token_str) :
                ]

        return image_positions

    @classmethod
    def from_pb_processor(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        processor,
        config,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "VlmCausalLMBatch":
        batch_tokenized_inputs, image_inputs, image_positions = (
            cls.batch_tokenized_inputs(pb.requests, tokenizer, processor, config)
        )
        batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device)
        batch.image_inputs = image_inputs
        batch.image_positions = image_positions
        batch.encoder_cache = [{} for _ in range(len(pb.requests))]
        if len(image_inputs):
            batch.pixel_values = None
            batch.pixel_attention_mask = None
            batch.image_sizes = None
            batch.image_grid_thw = None
        return batch

    def prepare_for_prefill(self):
        super().prepare_for_prefill()

        self.has_image_inputs = False
        self.cache_entries_to_free = []

        self.pixel_values = []

        assert (
            len(self.cache_lengths)
            == len(self.input_lengths)
            == len(self.prefilling_mask)
        ), "Mismatch in lengths of cache_lengths, input_lengths, and prefilling_mask"

        for i, (
            cache_length,
            input_length,
            request_prefilling,
        ) in enumerate(
            zip(
                self.cache_lengths,
                self.input_lengths,
                self.prefilling_mask,
            )
        ):
            if not request_prefilling or self.image_positions[i] is None:
                continue

            for image_position in self.image_positions[i]:
                if image_position is None:
                    continue
                start_pos = image_position.offset
                length = image_position.length

                if start_pos >= cache_length + input_length:
                    # No encoder input required at this step
                    break
                if start_pos + length <= cache_length:
                    # The encode input is already processed
                    continue

                self.has_image_inputs = True

                if image_position.id not in self.encoder_cache[i]:
                    image_inputs = self.image_inputs[i][image_position.id]
                    self.pixel_values.append((i, image_position.id, image_inputs))

                    # Remove the image from the image_inputs
                    self.image_inputs[i][image_position.id] = None

        if not self.has_image_inputs:
            self.pixel_values = None
            self.pixel_attention_mask = None
            self.image_sizes = None
            self.image_grid_thw = None
        else:
            image_grid_thw_list = [
                x[2]["image_grid_thw"]
                for x in self.pixel_values
                if "image_grid_thw" in x[2]
            ]
            if image_grid_thw_list:
                self.image_grid_thw = torch.cat(image_grid_thw_list, dim=0).to(
                    self.input_ids.device
                )
            else:
                self.image_grid_thw = None

    def update_encoder_cache(self, encoder_outputs, request_id, img_pos):
        self.encoder_cache[request_id][img_pos.id] = scatter_image_embeds(
            encoder_outputs, img_pos.is_embed
        )

    def gather_vision_embeds(self):
        device = self.input_ids.device
        chunks = []
        for (
            i,
            cache_length,
            input_length,
            request_prefilling,
        ) in zip(
            range(len(self.requests)),
            self.cache_lengths,
            self.input_lengths,
            self.prefilling_mask,
        ):
            if not request_prefilling or self.image_positions[i] is None:
                continue

            for image_position in self.image_positions[i]:
                if image_position is None:
                    continue
                start_pos = image_position.offset
                length = image_position.length

                if start_pos >= cache_length + input_length:
                    # No encoder input required at this step
                    break
                if start_pos + length <= cache_length:
                    # The encode input is already processed
                    continue

                start_idx = max(cache_length - start_pos, 0)
                end_idx = min(cache_length - start_pos + input_length, length)

                assert (
                    image_position.id in self.encoder_cache[i]
                ), f"image_id {image_position.id} not in encoder_cache {self.encoder_cache[i]}"
                encoder_output = self.encoder_cache[i][image_position.id]

                is_embed = image_position.is_embed
                if is_embed is not None:
                    is_embed = is_embed[start_idx:end_idx]

                from loguru import logger

                logger.info(
                    f"image_id {image_position.id} start_idx {start_idx} end_idx {end_idx}, length {length}"
                )

                embeds = gather_image_embeds(
                    encoder_output[start_idx:end_idx],
                    is_embed=is_embed,
                )
                if embeds is not None:
                    chunks.append(embeds)

                if end_idx == length:
                    self.cache_entries_to_free.append((i, image_position.id))
                    self.image_positions[i][image_position.id] = None

        if len(chunks) == 0:
            return None
        return torch.cat(chunks, dim=0).to(device)

    def free_encoder_cache(self):
        for i, image_id in self.cache_entries_to_free:
            self.encoder_cache[i].pop(image_id, None)

        self.cache_entries_to_free = []

        # release any freed GPU memory immediately?


class VlmCausalLM(FlashCausalLM):
    def __init__(
        self,
        model_id: str,
        *,
        processor_class=AutoProcessor,
        processor_kwargs=None,
        batch_class=VlmCausalLMBatch,
        revision,
        trust_remote_code: bool,
        support_chunking: bool = True,
        **kwargs,
    ):
        if PREFIX_CACHING:
            raise NotImplementedError("Vlm do not work with prefix caching yet")
        if processor_kwargs is None:
            processor_kwargs = {}
        self.processor = processor_class.from_pretrained(
            model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
            **processor_kwargs,
        )
        self.batch_class = batch_class
        super().__init__(
            model_id=model_id,
            revision=revision,
            trust_remote_code=trust_remote_code,
            support_chunking=support_chunking,
            **kwargs,
        )

    @property
    def batch_type(self) -> Type[VlmCausalLMBatch]:
        return self.batch_class

    def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
        max_bs = max(self.cuda_graphs.keys()) if self.cuda_graphs else None
        input_lengths = [max_s] * bs
        cache_lengths = [0] * bs
        config = getattr(self.model.config, "text_config", self.model.config)
        if max_bs is None:
            inputs_embeds = torch.zeros(
                (bs, config.hidden_size),
                device=self.device,
                dtype=self.dtype,
            )
            position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device)
            config = getattr(self.model, "config", None)
            rope_scaling = getattr(config, "rope_scaling", None) if config else None
            if (  # mrope have position_ids per section, if so repeat n times
                isinstance(rope_scaling, dict) and rope_scaling["rope_type"] == "mrope"
            ):
                n_sections = len(self.model.config.rope_scaling["mrope_section"])
                position_ids = position_ids.unsqueeze(1).repeat(1, n_sections)
            slots = torch.arange(bs, dtype=torch.int64, device=self.device)
            input_lengths_tensor = (
                torch.ones(bs, dtype=torch.int32, device=self.device) * max_s
            )
            cache_lengths_tensor = torch.zeros(
                bs, dtype=torch.int32, device=self.device
            )
            block_tables = torch.arange(
                max_bt, dtype=torch.int32, device=self.device
            ).repeat(bs)
            block_tables = block_tables.reshape((bs, max_bt))
            if ATTENTION == "flashinfer":
                block_tables = block_tables_to_ragged(
                    block_tables=block_tables,
                    input_lengths=input_lengths,
                    cache_lengths=cache_lengths,
                    input_lengths_tensor=input_lengths_tensor,
                    cache_lengths_tensor=cache_lengths_tensor,
                    max_current_length=max_s,
                )
        else:
            if bs > max_bs:
                raise RuntimeError(
                    "Cuda graphs should be generated in decreasing order size to reduce VRAM usage"
                )
            inputs_embeds = self.cuda_graphs[max_bs]["inputs_embeds"][:bs]
            position_ids = self.cuda_graphs[max_bs]["position_ids"][:bs]
            if ATTENTION == "flashinfer":
                block_tables = self.cuda_graphs[max_bs]["block_tables"][: bs * max_bt]
            else:
                block_tables = self.cuda_graphs[max_bs]["block_tables"][:bs]
            slots = self.cuda_graphs[max_bs]["slots"][:bs]
            input_lengths_tensor = self.cuda_graphs[max_bs]["input_lengths"][:bs]
            cache_lengths_tensor = self.cuda_graphs[max_bs]["cache_lengths"][:bs]

        if ATTENTION == "flashinfer":
            from text_generation_server.layers.attention.flashinfer import (
                create_decode_state_cuda_graphs,
            )

            block_tables_ptr = torch.zeros(
                bs + 1, dtype=torch.int32, device=self.device
            )
            last_page_len = torch.ones(bs, dtype=torch.int32, device=self.device)
            state = create_decode_state_cuda_graphs(
                device=inputs_embeds.device,
                block_tables=block_tables,
                block_tables_ptr=block_tables_ptr,
                last_page_len=last_page_len,
                num_heads=self.num_heads,
                num_kv_heads=self.num_kv_heads,
            )
        else:
            state = None

        graph = torch.cuda.CUDAGraph()
        self.cuda_graphs[bs] = {
            "inputs_embeds": inputs_embeds,
            "position_ids": position_ids,
            "kv_cache": self.kv_cache,
            "block_tables": block_tables,
            "slots": slots,
            "input_lengths": input_lengths_tensor,
            "cache_lengths": cache_lengths_tensor,
            "state": state,
            "graph": graph,
        }

        torch.cuda.synchronize()
        # Run once outside to warmup
        with self._forward_context(
            block_tables=block_tables,
            cu_seqlen_prefill=None,
            input_lengths_tensor=input_lengths_tensor,
            state=state,
            cache_lengths_tensor=cache_lengths_tensor,
        ):
            seqlen = Seqlen(
                input_lengths=input_lengths_tensor,
                cache_lengths=cache_lengths_tensor,
                cu_seqlen_q=None,
                max_q=1,
                max_k=max_s,
            )
            self.model.forward(
                inputs_embeds=inputs_embeds,
                position_ids=position_ids,
                cu_seqlen_prefill=None,
                kv_cache=self.kv_cache,
                block_tables=block_tables,
                slots=slots,
                seqlen=seqlen,
                max_s=max_s,
                prefill_cache_indices=None,
                lm_head_indices=None,
            )
            del seqlen

            torch.cuda.synchronize()

            with torch.cuda.graph(graph, pool=MEM_POOL):
                seqlen = Seqlen(
                    input_lengths=input_lengths_tensor,
                    cache_lengths=cache_lengths_tensor,
                    cu_seqlen_q=None,
                    max_q=1,
                    max_k=max_s,
                )
                logits, speculative_logits = self.model.forward(
                    inputs_embeds=inputs_embeds,
                    position_ids=position_ids,
                    cu_seqlen_prefill=None,
                    kv_cache=self.kv_cache,
                    block_tables=block_tables,
                    slots=slots,
                    seqlen=seqlen,
                    max_s=max_s,
                    prefill_cache_indices=None,
                    lm_head_indices=None,
                )
                self.cuda_graphs[bs]["logits"] = logits
                self.cuda_graphs[bs]["speculative_logits"] = speculative_logits
        torch.cuda.synchronize()

    def get_vision_embeds(
        self,
        pixel_values: torch.Tensor,
        pixel_attention_mask: torch.Tensor,
        image_sizes: torch.Tensor,
        image_grid_thw: torch.Tensor,
    ):
        embeds = self.model.get_vision_embeds(
            pixel_values=pixel_values,
            pixel_attention_mask=pixel_attention_mask,
            image_sizes=image_sizes,
            image_grid_thw=image_grid_thw,
        )
        return embeds

    def get_inputs_embeds(
        self,
        input_ids: torch.Tensor,
        vision_embeds: Optional[torch.Tensor] = None,
    ):
        return self.model.get_inputs_embeds(
            input_ids=input_ids,
            vision_embeds=vision_embeds,
        )

    def encode_images(self, batch):
        if batch.pixel_values is not None:
            device = batch.input_ids.device
            for request_id, image_id, image_input in batch.pixel_values:
                pixel_values = image_input["pixel_values"].to(device)

                if "pixel_attention_mask" in image_input:
                    pixel_attention_mask = image_input["pixel_attention_mask"].to(
                        device
                    )
                else:
                    pixel_attention_mask = None

                if "image_sizes" in image_input:
                    image_sizes = image_input["image_sizes"].to(device)
                else:
                    image_sizes = None

                if "image_grid_thw" in image_input:
                    image_grid_thw = image_input["image_grid_thw"].to(device)
                else:
                    image_grid_thw = None

                encoder_outputs = self.get_vision_embeds(
                    pixel_values=pixel_values,
                    pixel_attention_mask=pixel_attention_mask,
                    image_sizes=image_sizes,
                    image_grid_thw=image_grid_thw,
                )
                batch.update_encoder_cache(
                    encoder_outputs,
                    request_id,
                    batch.image_positions[request_id][image_id],
                )

        batch.pixel_values = None
        batch.pixel_attention_mask = None
        batch.image_sizes = None

    def set_inputs_embeds(self, batch):
        if batch.has_image_inputs:
            self.encode_images(batch)
            vision_embeds = batch.gather_vision_embeds()
            batch.has_image_inputs = False
        else:
            vision_embeds = None

        inputs_embeds = self.get_inputs_embeds(
            batch.input_ids, vision_embeds=vision_embeds
        )

        batch.inputs_embeds = inputs_embeds

    def forward(
        self,
        batch: VlmCausalLMBatch,
        adapter_data: Optional[Dict[str, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Model Forward
        if batch.speculative_ids is not None:
            input_ids = batch.input_ids
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

            speculative_ids = batch.speculative_ids

            B, speculative_length = speculative_ids.shape
            new_length = speculative_length + 1
            new_input_ids = torch.cat(
                [input_ids.unsqueeze(-1), speculative_ids], dim=1
            ).reshape(-1)
            arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0)
            arange_int = arange.to(dtype=torch.int32)
            new_position_ids = (
                position_ids.unsqueeze(-1).expand(B, new_length) + arange
            ).view(-1)
            slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1)
            input_lengths = (
                input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
            ).view(-1)
            cache_lengths_tensor = (
                batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length)
            ).reshape(-1)

            # Add Copy the block tables for all members
            block_tables = (
                block_tables.unsqueeze(1)
                .expand(B, new_length, -1)
                .reshape(B * new_length, -1)
                .contiguous()
            )
            max_s = max_s + speculative_length

            input_ids = new_input_ids
            position_ids = new_position_ids
        else:
            input_ids = batch.input_ids
            inputs_embeds = batch.inputs_embeds
            position_ids = batch.position_ids
            cu_seqlen_prefill = batch.cu_seqlen_prefill
            kv_cache = self.kv_cache
            block_tables = batch.block_tables_tensor
            slots = batch.slots[batch.slot_indices]
            input_lengths = batch.input_lengths_tensor
            cache_lengths_tensor = batch.cache_lengths_tensor
            max_s = batch.max_current_length
            lm_head_indices = batch.prefill_head_indices

        if self.model.config.model_type in {"qwen2_vl", "qwen2_5_vl"}:
            if position_ids.dim() == 1 and batch.prefilling:
                position_ids = self.model.get_position_ids(
                    input_ids, batch.image_grid_thw
                )
                batch.position_ids = position_ids

        attention_mask = None
        attention_mask_forward = None
        if self.model.config.model_type == "gemma3" and cu_seqlen_prefill is not None:
            attention_mask = self.model.get_attention_mask(
                input_ids, cu_seqlen_prefill, self.dtype, bool_mask=True
            )
            min_dtype = torch.finfo(self.dtype).min
            attention_mask_forward = torch.where(attention_mask, 0, min_dtype).to(
                input_ids.device
            )
            attention_mask = attention_mask.reshape(-1)

        # Try to find an associated cuda graph
        bs = input_ids.shape[0]
        sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
        if sorted_padded_bs:
            # Get associated cuda graph
            cuda_graph = self.cuda_graphs[sorted_padded_bs[0]]
        else:
            cuda_graph = None
        if cu_seqlen_prefill is not None or cuda_graph is None:
            if ATTENTION == "flashinfer":
                block_tables = block_tables_to_ragged(
                    block_tables=block_tables,
                    input_lengths=batch.input_lengths,
                    cache_lengths=batch.cache_lengths,
                    input_lengths_tensor=batch.input_lengths_tensor,
                    cache_lengths_tensor=batch.cache_lengths_tensor,
                    max_current_length=batch.max_current_length,
                )
            with self._forward_context(
                block_tables=block_tables,
                cu_seqlen_prefill=cu_seqlen_prefill,
                input_lengths_tensor=input_lengths,
                cache_lengths_tensor=cache_lengths_tensor,
                attention_mask=attention_mask,
            ):
                seqlen = Seqlen(
                    input_lengths=input_lengths,
                    cache_lengths=cache_lengths_tensor,
                    cu_seqlen_q=cu_seqlen_prefill,
                    max_q=batch.max_input_length,
                    max_k=batch.max_current_length,
                )
                logits, speculative_logits = self.model.forward(
                    inputs_embeds=inputs_embeds,
                    position_ids=position_ids,
                    cu_seqlen_prefill=cu_seqlen_prefill,
                    kv_cache=kv_cache,
                    block_tables=block_tables,
                    slots=slots,
                    seqlen=seqlen,
                    max_s=max_s,
                    prefill_cache_indices=batch.prefill_cache_indices,
                    lm_head_indices=lm_head_indices,
                    attention_mask=attention_mask_forward,
                )
                if batch.prefill_cache_indices is not None:
                    batch.prefill_cache_indices = None
                batch.image_grid_thw = None
                batch.free_encoder_cache()
                return logits, speculative_logits

        # Copy inputs to the static inputs of the cuda graph
        # Static inputs are potentially padded
        cuda_graph["inputs_embeds"][: inputs_embeds.shape[0]] = inputs_embeds
        cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids
        if ATTENTION == "flashinfer":
            block_tables = block_tables_to_ragged(
                block_tables=block_tables,
                input_lengths=batch.input_lengths,
                cache_lengths=batch.cache_lengths,
                input_lengths_tensor=batch.input_lengths_tensor,
                cache_lengths_tensor=batch.cache_lengths_tensor,
                max_current_length=batch.max_current_length,
            )
            cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables
        else:
            cuda_graph["block_tables"][
                : block_tables.shape[0], : block_tables.shape[1]
            ] = block_tables

        # XXX: This is working only because block 0 is reserved for the healthcheck
        # so it doesn't matter if we override it with bogus values.
        cuda_graph["slots"].fill_(0)
        cuda_graph["slots"][: slots.shape[0]] = slots
        cuda_graph["input_lengths"].zero_()
        cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
        cuda_graph["cache_lengths"].zero_()
        cuda_graph["cache_lengths"][
            : cache_lengths_tensor.shape[0]
        ] = cache_lengths_tensor

        with self._forward_context(
            block_tables=cuda_graph["block_tables"],
            cu_seqlen_prefill=None,
            input_lengths_tensor=cuda_graph["input_lengths"],
            cache_lengths_tensor=cuda_graph["cache_lengths"],
            state=cuda_graph["state"],
        ):
            # Replay the graph
            cuda_graph["graph"].replay()

        # Slice output to the correct shape
        speculative_logits = (
            cuda_graph["speculative_logits"][:bs]
            if cuda_graph["speculative_logits"] is not None
            else None
        )
        logits = cuda_graph["logits"][:bs]

        batch.free_encoder_cache()
        return logits, speculative_logits


================================================
FILE: server/text_generation_server/pb/.gitignore
================================================
*.py
*.pyi
*.py-e


================================================
FILE: server/text_generation_server/server.py
================================================
import asyncio
import os
import torch
import time
import signal

from grpc import aio
from loguru import logger

from grpc_reflection.v1alpha import reflection
from pathlib import Path
from typing import List, Optional

from text_generation_server.cache import Cache
from text_generation_server.interceptor import ExceptionInterceptor
from text_generation_server.models import Model, get_model_with_lora_adapters
from text_generation_server.utils.adapter import AdapterInfo
from text_generation_server.utils.prefill_chunking import set_max_prefill_tokens

try:
    from text_generation_server.models.vlm_causal_lm import (
        VlmCausalLMBatch,
    )
    from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch
    from text_generation_server.models.mllama_causal_lm import MllamaCausalLMBatch

    VLM_BATCH_TYPES = {
        VlmCausalLMBatch,
        IdeficsCausalLMBatch,
        MllamaCausalLMBatch,
    }
except (ImportError, NotImplementedError):
    # These imports can fail on CPU/Non flash.
    VLM_BATCH_TYPES = set()

from text_generation_server.pb import generate_pb2_grpc, generate_pb2
from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor
from text_generation_server.models.globals import set_adapter_to_index


class SignalHandler:
    KEEP_PROCESSING = True

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def set_keep_processing(self, value: bool):
        self.KEEP_PROCESSING = value

    def exit_gracefully(self, signum, frame):
        print(f"Exiting gracefully: Signal {signum}")
        self.set_keep_processing(False)


class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
    def __init__(
        self,
        model: Model,
        cache: Cache,
        server_urls: List[str],
    ):
        self.cache = cache
        self.model = model
        # Quantize is resolved during model loading
        self.quantize = model.quantize
        self.server_urls = server_urls
        # For some reason, inference_mode does not work well with GLOO which we use on CPU
        # if model.device.type == "cuda":
        #     # Force inference mode for the lifetime of TextGenerationService
        #     self._inference_mode_raii_guard = torch._C._InferenceMode(True)

    async def Info(self, request, context):
        return self.model.info

    async def Health(self, request, context):
        if self.model.device.type == "cuda":
            torch.zeros((2, 2)).cuda()
        return generate_pb2.HealthResponse()

    async def ServiceDiscovery(self, request, context):
        return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)

    async def ClearCache(self, request, context):
        if request.HasField("id"):
            self.cache.delete(request.id)
        else:
            self.cache.clear()
        return generate_pb2.ClearCacheResponse()

    async def FilterBatch(self, request, context):
        batch = self.cache.pop(request.batch_id)
        if batch is None:
            raise ValueError(f"Batch ID {request.batch_id} not found in cache.")
        filtered_batch = batch.filter(request.request_ids)
        self.cache.set(filtered_batch)

        return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())

    async def Warmup(self, request, context):
        set_max_prefill_tokens(request.max_prefill_tokens)

        if self.quantize in {"exl2", "gptq"}:
            try:
                # When using GPTQ, Exllama kernels need some global kernels
                # For which we have the finale shapes only after the model has loaded
                # This will allocate those buffers.
                from text_generation_server.layers.gptq import (
                    create_exllama_buffers,
                    set_device,
                )

                set_device(self.model.device)
                create_exllama_buffers(request.max_prefill_tokens)
            except ImportError:
                pass

        if (
            self.model.batch_type in VLM_BATCH_TYPES
        ):  # Hack, i would rather use kwargs in the `from_pb` call
            batch = self.model.batch_type.from_pb_processor(
                request.batch,
                self.model.tokenizer,
                self.model.processor,
                self.model.model.config,
                self.model.dtype,
                self.model.device,
            )
        else:
            batch = self.model.batch_type.from_pb(
                request.batch, self.model.tokenizer, self.model.dtype, self.model.device
            )

        # Override default values with None for clearer semantics.
        max_input_tokens = (
            request.max_input_tokens if request.HasField("max_input_tokens") else None
        )
        max_total_tokens = (
            request.max_total_tokens if request.HasField("max_total_tokens") else None
        )
        max_supported_total_tokens, max_input_tokens, max_total_tokens = (
            self.model.warmup(batch, max_input_tokens, max_total_tokens)
        )

        return generate_pb2.WarmupResponse(
            max_supported_total_tokens=max_supported_total_tokens,
            max_input_tokens=max_input_tokens,
            max_total_tokens=max_total_tokens,
        )

    async def Prefill(self, request, context):
        start = time.time_ns()
        if (
            self.model.batch_type in VLM_BATCH_TYPES
        ):  # Hack, i would rather use kwargs in the `from_pb` call
            batch = self.model.batch_type.from_pb_processor(
                request.batch,
                self.model.tokenizer,
                self.model.processor,
                self.model.model.config,
                self.model.dtype,
                self.model.device,
            )
        else:
            batch = self.model.batch_type.from_pb(
                request.batch, self.model.tokenizer, self.model.dtype, self.model.device
            )

        concat_ns = None
        if self.model.support_chunking:
            if request.HasField("cached_batch"):
                cached_batch = self.cache.pop(request.cached_batch.id)
                if cached_batch is None:
                    raise ValueError(
                        f"Batch ID {request.cached_batch.id} not found in cache."
                    )
                start_concat = time.time_ns()
                batch = self.model.batch_type.concatenate([cached_batch, batch])
                concat_ns = time.time_ns() - start_concat

        generations, next_batch, timings = self.model.generate_token(batch)
        self.cache.set(next_batch)

        return generate_pb2.PrefillResponse(
            generations=[generation.to_pb() for generation in generations],
            batch=next_batch.to_pb() if next_batch else None,
            forward_ns=timings[0],
            decode_ns=timings[1],
            total_ns=time.time_ns() - start,
            concat_ns=concat_ns,
        )

    async def Decode(self, request, context):
        start = time.time_ns()
        if len(request.batches) == 0:
            raise ValueError("Must provide at least one batch")

        batches = []
        for batch_pb in request.batches:
            batch = self.cache.pop(batch_pb.id)
            if batch is None:
                raise ValueError(f"Batch ID {batch_pb.id} not found in cache.")
            batches.append(batch)

        if len(batches) == 0:
            raise ValueError("All batches are empty")

        if len(batches) > 1:
            start_concat = time.time_ns()
            batch = self.model.batch_type.concatenate(batches)
            concat_ns = time.time_ns() - start_concat
        else:
            batch = batches[0]
            concat_ns = None

        generations, next_batch, timings = self.model.generate_token(batch)
        self.cache.set(next_batch)

        return generate_pb2.DecodeResponse(
            generations=[generation.to_pb() for generation in generations],
            batch=next_batch.to_pb() if next_batch else None,
            concat_ns=concat_ns,
            forward_ns=timings[0],
            decode_ns=timings[1],
            total_ns=time.time_ns() - start,
        )


def serve(
    model_id: str,
    lora_adapters: Optional[List[AdapterInfo]],
    revision: Optional[str],
    sharded: bool,
    quantize: Optional[str],
    speculate: Optional[int],
    dtype: Optional[str],
    kv_cache_dtype: Optional[str],
    trust_remote_code: bool,
    uds_path: Path,
    max_input_tokens: int,
):
    async def serve_inner(
        model_id: str,
        lora_adapters: Optional[List[AdapterInfo]],
        revision: Optional[str],
        sharded: bool = False,
        quantize: Optional[str] = None,
        speculate: Optional[int] = None,
        dtype: Optional[str] = None,
        kv_cache_dtype: Optional[str] = None,
        trust_remote_code: bool = False,
    ):
        unix_socket_template = "unix://{}-{}"
        adapter_to_index = {}
        if sharded:
            server_urls = [
                unix_socket_template.format(uds_path, rank)
                for rank in range(int(os.environ["WORLD_SIZE"]))
            ]
            local_url = server_urls[int(os.environ["RANK"])]
        else:
            local_url = unix_socket_template.format(uds_path, 0)
            server_urls = [local_url]

        try:
            model = get_model_with_lora_adapters(
                model_id,
                lora_adapters,
                revision,
                sharded,
                quantize,
                speculate,
                dtype,
                kv_cache_dtype,
                trust_remote_code,
                max_input_tokens,
                adapter_to_index,
            )

        except Exception:
            logger.exception("Error when initializing model")
            raise

        signal_handler = SignalHandler()

        set_adapter_to_index(adapter_to_index)
        server = aio.server(
            interceptors=[
                ExceptionInterceptor(lambda: signal_handler.set_keep_processing(False)),
                UDSOpenTelemetryAioServerInterceptor(),
            ],
            options=[
                # Set the maximum possible message length: i32::MAX
                ("grpc.max_receive_message_length", (1 << 31) - 1)
            ],
        )
        generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
            TextGenerationService(model, Cache(), server_urls), server
        )
        SERVICE_NAMES = (
            generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
            reflection.SERVICE_NAME,
        )
        reflection.enable_server_reflection(SERVICE_NAMES, server)
        server.add_insecure_port(local_url)

        await server.start()

        logger.info("Server started at {}".format(local_url))
        while signal_handler.KEEP_PROCESSING:
            await asyncio.sleep(0.5)

    asyncio.run(
        serve_inner(
            model_id,
            lora_adapters,
            revision,
            sharded,
            quantize,
            speculate,
            dtype,
            kv_cache_dtype,
            trust_remote_code,
        )
    )


================================================
FILE: server/text_generation_server/tracing.py
================================================
import grpc

from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.grpc._aio_server import (
    OpenTelemetryAioServerInterceptor,
)
from opentelemetry.semconv.trace import SpanAttributes
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
    BatchSpanProcessor,
)


class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor):
    def __init__(self):
        super().__init__(trace.get_tracer(__name__))

    def _start_span(self, handler_call_details, context, set_status_on_exception=False):
        """
        Rewrite _start_span method to support Unix Domain Socket gRPC contexts
        """

        # standard attributes
        attributes = {
            SpanAttributes.RPC_SYSTEM: "grpc",
            SpanAttributes.RPC_GRPC_STATUS_CODE: grpc.StatusCode.OK.value[0],
        }

        # if we have details about the call, split into service and method
        if handler_call_details.method:
            service, method = handler_call_details.method.lstrip("/").split("/", 1)
            attributes.update(
                {
                    SpanAttributes.RPC_METHOD: method,
                    SpanAttributes.RPC_SERVICE: service,
                }
            )

        # add some attributes from the metadata
        metadata = dict(context.invocation_metadata())
        if "user-agent" in metadata:
            attributes["rpc.user_agent"] = metadata["user-agent"]

        # We use gRPC over a UNIX socket
        attributes.update({SpanAttributes.NET_TRANSPORT: "unix"})

        return self._tracer.start_as_current_span(
            name=handler_call_details.method,
            kind=trace.SpanKind.SERVER,
            attributes=attributes,
            set_status_on_exception=set_status_on_exception,
        )


def setup_tracing(otlp_service_name: str, otlp_endpoint: str):
    resource = Resource.create(attributes={"service.name": otlp_service_name})
    span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
    span_processor = BatchSpanProcessor(span_exporter)

    trace.set_tracer_provider(TracerProvider(resource=resource))
    trace.get_tracer_provider().add_span_processor(span_processor)


================================================
FILE: server/text_generation_server/utils/__init__.py
================================================
from text_generation_server.utils.convert import convert_file, convert_files
from text_generation_server.utils.dist import initialize_torch_distributed
from text_generation_server.utils.weights import Weights
from text_generation_server.utils.peft import download_and_unload_peft
from text_generation_server.utils.hub import (
    weight_files,
    weight_hub_files,
    download_weights,
    EntryNotFoundError,
    LocalEntryNotFoundError,
    RevisionNotFoundError,
)
from text_generation_server.utils.tokens import (
    NextTokenChooser,
    HeterogeneousNextTokenChooser,
    StoppingCriteria,
    StopSequenceCriteria,
    FinishReason,
    Sampling,
    Greedy,
)

__all__ = [
    "convert_file",
    "convert_files",
    "initialize_torch_distributed",
    "weight_files",
    "weight_hub_files",
    "download_weights",
    "download_and_unload_peft",
    "EntryNotFoundError",
    "HeterogeneousNextTokenChooser",
    "LocalEntryNotFoundError",
    "RevisionNotFoundError",
    "Greedy",
    "NextTokenChooser",
    "Sampling",
    "StoppingCriteria",
    "StopSequenceCriteria",
    "FinishReason",
    "Weights",
]


================================================
FILE: server/text_generation_server/utils/adapter.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/utils/adapter.py
# License:  Apache License Version 2.0, January 2004

import warnings
import re
from dataclasses import dataclass
from functools import lru_cache
from typing import TYPE_CHECKING, Set, Tuple, Optional, List

from safetensors.torch import load_file
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer

from text_generation_server.utils.merges.strategies import merge_adapters

from text_generation_server.utils import hub
from text_generation_server.adapters.lora import LoraConfig


if TYPE_CHECKING:
    from text_generation_server.adapters.config import AdapterConfig, ModuleMap


BASE_MODEL_ADAPTER_ID = "__base_model__"


@dataclass
class AdapterInfo:
    id: str
    path: Optional[str]
    revision: Optional[str] = None


@dataclass
class AdapterParameters:
    adapter_info: Tuple[AdapterInfo]
    weights: Tuple[float]
    merge_strategy: NotImplemented
    density: float
    majority_sign_method: NotImplemented


@dataclass
class AdapterSource:
    adapter_id: str
    model_id: str
    revision: str


def parse_lora_adapters(lora_adapters: Optional[str]) -> List[AdapterInfo]:
    if not lora_adapters:
        return []

    adapter_list = []
    for adapter in lora_adapters.split(","):
        adapter = adapter.strip()
        if adapter.count("=") > 1 or adapter.count("@") > 1:
            raise ValueError(f"Invalid LoRA adapter format: {adapter}")
        match = re.match(r"^([^=@]+)(?:=([^@]+))?(?:@(.+))?$", adapter)

        if match:
            adapter_id, path, revision = match.groups()
            adapter_list.append(
                AdapterInfo(id=adapter_id, path=path, revision=revision)
            )
        else:
            raise ValueError(f"Invalid LoRA adapter format: {adapter}")
    return adapter_list


def load_and_merge_adapters(
    model_id: str,
    adapter_parameters: AdapterParameters,
    adapter_index: int,
    weight_names: Tuple[str],
    trust_remote_code: bool = False,
) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
    if len(adapter_parameters.adapter_info) == 1:
        adapter = next(iter(adapter_parameters.adapter_info))
        return load_module_map(
            model_id,
            adapter.revision,
            adapter.id,
            adapter.path,
            weight_names,
            trust_remote_code,
        )

    adapter_params = AdapterParametersContainer(adapter_parameters, adapter_index)
    return _load_and_merge(
        model_id,
        adapter_params,
        weight_names,
        trust_remote_code,
    )


@dataclass
class AdapterParametersContainer:
    adapter_parameters: AdapterParameters
    adapter_index: int

    def __hash__(self) -> int:
        return self.adapter_index


@lru_cache(maxsize=32)
def _load_and_merge(
    model_id: str,
    adapter_params: AdapterParametersContainer,
    weight_names: Tuple[str],
    trust_remote_code: bool = False,
) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
    params = adapter_params.adapter_parameters

    adapters_to_merge = []
    merged_weight_names = set()
    tokenizer = None
    for adapter in params.adapter_info:
        if adapter.id == BASE_MODEL_ADAPTER_ID:
            raise ValueError("Base model adapter cannot be merged.")

        (
            module_map,
            adapter_config,
            adapter_weight_names,
            adapter_tokenizer,
        ) = load_module_map(
            model_id,
            adapter.revision,
            adapter.id,
            adapter.path,
            weight_names,
            trust_remote_code,
        )

        adapters_to_merge.append((module_map, adapter_config))
        merged_weight_names = merged_weight_names.union(adapter_weight_names)
        if tokenizer is None:
            tokenizer = adapter_tokenizer

    if len(adapters_to_merge) == 0:
        raise ValueError("No adapters to merge.")

    module_map, adapter_config = merge_adapters(adapters_to_merge, params)
    return module_map, adapter_config, merged_weight_names, tokenizer


def check_architectures(
    model_id: str,
    adapter_id: str,
    adapter_config: "AdapterConfig",
    trust_remote_code: bool = False,
):
    try:
        if not adapter_config.base_model_name_or_path:
            # Avoid execution latency caused by the network connection retrying for AutoConfig.from_pretrained(None)
            return

        expected_config = AutoConfig.from_pretrained(
            model_id, trust_remote_code=trust_remote_code
        )
        model_config = AutoConfig.from_pretrained(
            adapter_config.base_model_name_or_path, trust_remote_code=trust_remote_code
        )
    except Exception as e:
        warnings.warn(
            f"Unable to check architecture compatibility for adapter '{adapter_id}' "
            f"against model '{model_id}'. Assuming they are compatible. Error: {e}"
        )
        return

    if model_config.architectures == expected_config.architectures:
        warnings.warn(
            f"Adapter '{adapter_id}' was not trained on base model '{model_id}'. "
            f"If you encounter issues, use --model-id '{adapter_config.base_model_name_or_path}' instead."
        )
    else:
        # TODO(travis): revisit this when we support clasification heads which will not use CausalLM
        raise ValueError(
            f"Adapter '{adapter_id}' is not compatible with model '{model_id}'. "
            f"Architectures differ: {model_config.architectures} != {expected_config.architectures}. "
            f"Use --model-id '{adapter_config.base_model_name_or_path}' instead."
        )


@lru_cache(maxsize=128)
def load_module_map(
    model_id: str,
    revision: str,
    adapter_id: str,
    adapter_path: Optional[str],
    weight_names: Tuple[str],
    trust_remote_code: bool = False,
) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
    adapter_config = LoraConfig.load(adapter_path or adapter_id, None)

    if not adapter_path and adapter_config.base_model_name_or_path != model_id:
        check_architectures(model_id, adapter_id, adapter_config, trust_remote_code)

    adapter_filenames = (
        hub._weight_files_from_dir(adapter_path, extension=".safetensors")
        if adapter_path
        else hub._cached_weight_files(
            adapter_id, revision=revision, extension=".safetensors"
        )
    )

    # throw an error if no adapter weights are found
    if not adapter_filenames:
        raise FileNotFoundError(
            f"No adapter weights found for adapter '{adapter_id}' and revision '{revision}'."
        )

    try:
        adapter_tokenizer = AutoTokenizer.from_pretrained(
            adapter_config.config_path,
            trust_remote_code=trust_remote_code,
        )
    except Exception:
        # Adapter does not have a tokenizer, so fallback to base model tokenizer
        adapter_tokenizer = None

    # load adapter weights from all shards (should have relatively small memory footprint)
    adapter_weights = {}
    for filename in adapter_filenames:
        adapter_weights.update(load_file(filename))

    # map the model weights to the relevant adapter weights (LoRA A and B matrices)
    module_map, adapter_weight_names = adapter_config.map_weights_for_model(
        adapter_weights, weight_names
    )
    return module_map, adapter_config, adapter_weight_names, adapter_tokenizer


def get_attn_weights(i, layer):
    qkv = layer.self_attn.query_key_value
    weights = {}

    for k in ["q", "k", "v"]:
        key = (i, f"{k}_proj")
        value = (f"model.layers.{i}.self_attn.{k}_proj", qkv)
        weights[key] = value

    # also add the qkv_proj weight for the adapter
    weights[(i, "qkv_proj")] = (
        f"model.layers.{i}.self_attn.qkv_proj",
        qkv,
    )

    weights[(i, "o_proj")] = (
        f"model.layers.{i}.self_attn.o_proj",
        layer.self_attn.o_proj,
    )

    return weights


def get_mlp_weights(i, layer):
    weights = {}
    if hasattr(layer, "mlp"):
        mlp = layer.mlp
        if hasattr(mlp, "gate_up_proj"):
            # handle combined gate_up_proj (e.g., for some LLaMA variants)
            weights.update(
                {
                    (i, "gate_proj"): (
                        f"model.layers.{i}.mlp.gate_proj",
                        mlp.gate_up_proj,
                    ),
                    (i, "up_proj"): (f"model.layers.{i}.mlp.up_proj", mlp.gate_up_proj),
                }
            )
        else:
            # handle separate gate_proj, up_proj, and down_proj (e.g., for Gemma)
            if hasattr(mlp, "gate_proj"):
                weights[(i, "gate_proj")] = (
                    f"model.layers.{i}.mlp.gate_proj",
                    mlp.gate_proj,
                )
            if hasattr(mlp, "up_proj"):
                weights[(i, "up_proj")] = (f"model.layers.{i}.mlp.up_proj", mlp.up_proj)

        if hasattr(mlp, "c_fc"):
            weights[(i, "c_fc")] = (f"model.layers.{i}.mlp.c_fc", mlp.c_fc)

        if hasattr(mlp, "c_proj"):
            weights[(i, "c_proj")] = (f"model.layers.{i}.mlp.c_proj", mlp.c_proj)

        if hasattr(mlp, "down_proj"):
            weights[(i, "down_proj")] = (
                f"model.layers.{i}.mlp.down_proj",
                mlp.down_proj,
            )

    return weights


# build_layer_weight_lookup creates a mapping of model layers to their corresponding
# weight tensors and paths. It builds a dictionary that maps layer identifiers to tuples
# containing the weight tensor path and the actual layer object. This mapping is needed
# for the lora adapter to know which weights to update when applying the adapter.
def build_layer_weight_lookup(model):
    if hasattr(model, "language_model"):
        m = model.language_model.model
    elif hasattr(model, "text_model"):
        m = model.text_model.model
    else:
        m = model.model

    layer_weights = {}

    for i, layer in enumerate(m.layers):
        attn_weights = get_attn_weights(i, layer)
        mlp_weights = get_mlp_weights(i, layer)

        layer_weights.update(attn_weights)
        layer_weights.update(mlp_weights)

    lm_head = None
    if hasattr(m, "lm_head"):
        lm_head = m.lm_head
    elif hasattr(model, "lm_head"):
        lm_head = model.lm_head

    if lm_head:
        layer_weights[(0, "lm_head")] = ("lm_head", lm_head)

    return layer_weights


================================================
FILE: server/text_generation_server/utils/chunks.py
================================================
from typing import Iterable

from loguru import logger

from text_generation_server.pb import generate_pb2


def concat_text_chunks(chunks: Iterable[generate_pb2.InputChunk]) -> str:
    """
    Concatenate text in text chunks. Non-text chunks are dropped.
    """
    text = None
    for chunk in chunks:
        chunk_type = chunk.WhichOneof("chunk")
        if chunk_type == "text":
            if text is None:
                text = chunk.text
            else:
                raise NotImplementedError("Request contained more than one text chunk")
        else:
            # We cannot reject this, e.g. warmup sends an image chunk.
            logger.debug(f"Encountered non-text chunk type {chunk_type}")

    if text is None:
        raise NotImplementedError("Request without a text chunk")

    return text


================================================
FILE: server/text_generation_server/utils/convert.py
================================================
import datetime
import torch
import os

from loguru import logger
from pathlib import Path
from safetensors.torch import save_file, load_file, _find_shared_tensors, _is_complete
from typing import List, Dict
from collections import defaultdict


def _remove_duplicate_names(
    state_dict: Dict[str, torch.Tensor],
    *,
    preferred_names: List[str] = None,
    discard_names: List[str] = None,
) -> Dict[str, List[str]]:
    if preferred_names is None:
        preferred_names = []
    preferred_names = set(preferred_names)
    if discard_names is None:
        discard_names = []
    discard_names = set(discard_names)

    shareds = _find_shared_tensors(state_dict)
    to_remove = defaultdict(list)
    for shared in shareds:
        complete_names = set(
            [name for name in shared if _is_complete(state_dict[name])]
        )
        if not complete_names:
            if len(shared) == 1:
                # Force contiguous
                name = list(shared)[0]
                state_dict[name] = state_dict[name].clone()
                complete_names = {name}
            else:
                raise RuntimeError(
                    f"Error while trying to find names to remove to save state dict, but found no suitable name to keep for saving amongst: {shared}. None is covering the entire storage.Refusing to save/load the model since you could be storing much more memory than needed. Please refer to https://huggingface.co/docs/safetensors/torch_shared_tensors for more information. Or open an issue."
                )

        keep_name = sorted(list(complete_names))[0]

        # Mecanism to preferentially select keys to keep
        # coming from the on-disk file to allow
        # loading models saved with a different choice
        # of keep_name
        preferred = complete_names.difference(discard_names)
        if preferred:
            keep_name = sorted(list(preferred))[0]

        if preferred_names:
            preferred = preferred_names.intersection(complete_names)
            if preferred:
                keep_name = sorted(list(preferred))[0]
        for name in sorted(shared):
            if name != keep_name:
                to_remove[keep_name].append(name)
    return to_remove


def convert_file(pt_file: Path, sf_file: Path, discard_names: List[str]):
    """
    Convert a pytorch file to a safetensors file
    This will remove duplicate tensors from the file.

    Unfortunately, this might not respect *transformers* convention.
    Forcing us to check for potentially different keys during load when looking
    for specific tensors (making tensor sharing explicit).
    """
    loaded = torch.load(pt_file, map_location="cpu", weights_only=True)
    if "state_dict" in loaded:
        loaded = loaded["state_dict"]
    to_removes = _remove_duplicate_names(loaded, discard_names=discard_names)

    metadata = {"format": "pt"}
    for kept_name, to_remove_group in to_removes.items():
        for to_remove in to_remove_group:
            if to_remove not in metadata:
                metadata[to_remove] = kept_name
            del loaded[to_remove]
    # Force tensors to be contiguous
    loaded = {k: v.contiguous() for k, v in loaded.items()}

    dirname = os.path.dirname(sf_file)
    os.makedirs(dirname, exist_ok=True)
    save_file(loaded, sf_file, metadata=metadata)
    reloaded = load_file(sf_file)
    for k in loaded:
        pt_tensor = loaded[k]
        sf_tensor = reloaded[k]
        if not torch.equal(pt_tensor, sf_tensor):
            raise RuntimeError(f"The output tensors do not match for key {k}")


def convert_files(pt_files: List[Path], sf_files: List[Path], discard_names: List[str]):
    assert len(pt_files) == len(sf_files)

    N = len(pt_files)
    # We do this instead of using tqdm because we want to parse the logs with the launcher

    for i, (pt_file, sf_file) in enumerate(zip(pt_files, sf_files)):
        # Skip blacklisted files
        if (
            "arguments" in pt_file.name
            or "args" in pt_file.name
            or "training" in pt_file.name
        ):
            continue

        start = datetime.datetime.now()
        convert_file(pt_file, sf_file, discard_names)
        elapsed = datetime.datetime.now() - start
        logger.info(f"Convert: [{i + 1}/{N}] -- Took: {elapsed}")


================================================
FILE: server/text_generation_server/utils/dist.py
================================================
import os
import torch
from torch.distributed import ProcessGroup
from datetime import timedelta
from loguru import logger
from text_generation_server.utils.import_utils import SYSTEM

# Tensor Parallelism settings
RANK = int(os.getenv("RANK", "0"))
WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1"))

# CUDA memory fraction
MEMORY_FRACTION = float(os.getenv("CUDA_MEMORY_FRACTION", "1.0"))


class FakeBarrier:
    def wait(self):
        pass


class FakeGroup(ProcessGroup):
    def __init__(self, rank, size):
        self._rank = rank
        self._size = size
        super().__init__(rank, size)

    def allreduce(self, *args, **kwargs):
        return FakeBarrier()

    def allgather(self, inputs, local_tensor, **kwargs):
        assert (
            len(inputs[0]) == len(local_tensor) == 1
        ), f"{len(inputs[0])} != {len(local_tensor)} != 1, and the FakeGroup is supposed to join on simple tensors"
        for input_ in inputs:
            input_[0].data = local_tensor[0].data
        return FakeBarrier()

    def barrier(self, *args, **kwargs):
        return FakeBarrier()

    def size(self):
        return self._size

    def rank(self):
        return self._rank


def initialize_torch_distributed():
    if torch.cuda.is_available():
        from torch.distributed import ProcessGroupNCCL

        # Set the device id.
        assert WORLD_SIZE <= torch.cuda.device_count(), "Each process is one gpu"
        device = RANK % torch.cuda.device_count()
        torch.cuda.set_device(device)
        torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION, device)
        backend = "nccl"
        options = ProcessGroupNCCL.Options()
        options.is_high_priority_stream = True
        options._timeout = timedelta(seconds=120)
    else:
        backend = "gloo"
        options = None

    if WORLD_SIZE == 1:
        return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE
    else:
        if os.getenv("DEBUG", None) == "1":
            return FakeGroup(RANK, WORLD_SIZE), RANK, WORLD_SIZE

        if not torch.distributed.is_initialized():
            # Call the init process.
            if SYSTEM == "ipex":
                import intel_extension_for_pytorch as ipex

                if torch.xpu.is_available():
                    assert (
                        WORLD_SIZE <= torch.xpu.device_count()
                    ), "Each process is one xpu"
                    device = RANK % torch.xpu.device_count()
                    torch.xpu.set_device(device)
                    device_id = torch.device(f"xpu:{RANK}")
                    torch.distributed.init_process_group(
                        backend="xccl",
                        world_size=WORLD_SIZE,
                        rank=RANK,
                        timeout=timedelta(seconds=120),
                        pg_options=options,
                        device_id=device_id,
                    )
                else:
                    ipex.distributed.init_process_group(
                        backend="ccl",
                        world_size=WORLD_SIZE,
                        rank=RANK,
                        timeout=timedelta(seconds=120),
                        pg_options=options,
                    )
            else:
                device = torch.device(f"cuda:{RANK}")
                torch.distributed.init_process_group(
                    backend=backend,
                    world_size=WORLD_SIZE,
                    rank=RANK,
                    timeout=timedelta(seconds=120),
                    pg_options=options,
                    device_id=device,
                )
        else:
            logger.warning("torch.distributed is already initialized.")

        return torch.distributed.group.WORLD, RANK, WORLD_SIZE


================================================
FILE: server/text_generation_server/utils/hub.py
================================================
import time
import os

from datetime import timedelta
from loguru import logger
from pathlib import Path
from typing import Optional, List

from huggingface_hub import file_download, hf_api, HfApi, hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from huggingface_hub.utils import (
    LocalEntryNotFoundError,
    EntryNotFoundError,
    RevisionNotFoundError,  # noqa # Import here to ease try/except in other part of the lib
)

WEIGHTS_CACHE_OVERRIDE = os.getenv("WEIGHTS_CACHE_OVERRIDE", None)
HF_HUB_OFFLINE = os.environ.get("HF_HUB_OFFLINE", "0").lower() in ["true", "1", "yes"]


def _cached_weight_files(
    model_id: str, revision: Optional[str], extension: str
) -> List[str]:
    """Guess weight files from the cached revision snapshot directory"""
    d = _get_cached_revision_directory(model_id, revision)
    if not d:
        return []
    filenames = _weight_files_from_dir(d, extension)
    return filenames


def _weight_hub_files_from_model_info(
    info: hf_api.ModelInfo, extension: str
) -> List[str]:
    return [
        s.rfilename
        for s in info.siblings
        if s.rfilename.endswith(extension)
        and len(s.rfilename.split("/")) == 1
        and "arguments" not in s.rfilename
        and "args" not in s.rfilename
        and "training" not in s.rfilename
    ]


def _weight_files_from_dir(d: Path, extension: str) -> List[str]:
    # os.walk: do not iterate, just scan for depth 1, not recursively
    # see _weight_hub_files_from_model_info, that's also what is
    # done there with the len(s.rfilename.split("/")) == 1 condition
    root, _, files = next(os.walk(str(d)))
    filenames = [
        os.path.join(root, f)
        for f in files
        if f.endswith(extension)
        and "arguments" not in f
        and "args" not in f
        and "training" not in f
    ]
    return filenames


def _get_cached_revision_directory(
    model_id: str, revision: Optional[str]
) -> Optional[Path]:
    if revision is None:
        revision = "main"

    repo_cache = Path(HUGGINGFACE_HUB_CACHE) / Path(
        file_download.repo_folder_name(repo_id=model_id, repo_type="model")
    )

    if not repo_cache.is_dir():
        # No cache for this model
        return None

    refs_dir = repo_cache / "refs"
    snapshots_dir = repo_cache / "snapshots"

    # Resolve refs (for instance to convert main to the associated commit sha)
    if refs_dir.is_dir():
        revision_file = refs_dir / revision
        if revision_file.exists():
            with revision_file.open() as f:
                revision = f.read()

    # Check if revision folder exists
    if not snapshots_dir.exists():
        return None
    cached_shas = os.listdir(snapshots_dir)
    if revision not in cached_shas:
        # No cache for this revision and we won't try to return a random revision
        return None

    return snapshots_dir / revision


def weight_hub_files(
    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
) -> List[str]:
    """Get the weights filenames on the hub"""
    api = HfApi()

    if HF_HUB_OFFLINE:
        filenames = _cached_weight_files(model_id, revision, extension)
    else:
        # Online case, fetch model info from the Hub
        info = api.model_info(model_id, revision=revision)
        filenames = _weight_hub_files_from_model_info(info, extension)

    if not filenames:
        raise EntryNotFoundError(
            f"No {extension} weights found for model {model_id} and revision {revision}.",
            None,
        )

    return filenames


def try_to_load_from_cache(
    model_id: str, revision: Optional[str], filename: str
) -> Optional[Path]:
    """Try to load a file from the Hugging Face cache"""

    d = _get_cached_revision_directory(model_id, revision)
    if not d:
        return None

    # Check if file exists in cache
    cached_file = d / filename
    return cached_file if cached_file.is_file() else None


def weight_files(
    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
) -> List[Path]:
    """Get the local files"""
    # Local model
    d = Path(model_id)
    if d.exists() and d.is_dir():
        local_files = _weight_files_from_dir(d, extension)
        if not local_files:
            raise FileNotFoundError(
                f"No local weights found in {model_id} with extension {extension}"
            )
        return [Path(f) for f in local_files]

    try:
        filenames = weight_hub_files(model_id, revision, extension)
    except EntryNotFoundError as e:
        if extension != ".safetensors":
            raise e
        # Try to see if there are pytorch weights
        pt_filenames = weight_hub_files(model_id, revision, extension=".bin")
        # Change pytorch extension to safetensors extension
        # It is possible that we have safetensors weights locally even though they are not on the
        # hub if we converted weights locally without pushing them
        filenames = [
            f"{Path(f).stem.lstrip('pytorch_')}.safetensors" for f in pt_filenames
        ]

    if WEIGHTS_CACHE_OVERRIDE is not None:
        files = []
        for filename in filenames:
            p = Path(WEIGHTS_CACHE_OVERRIDE) / filename
            if not p.exists():
                raise FileNotFoundError(
                    f"File {p} not found in {WEIGHTS_CACHE_OVERRIDE}."
                )
            files.append(p)
        return files

    files = []
    for filename in filenames:
        cache_file = try_to_load_from_cache(
            model_id, revision=revision, filename=filename
        )
        if cache_file is None:
            raise LocalEntryNotFoundError(
                f"File {filename} of model {model_id} not found in "
                f"{os.getenv('HUGGINGFACE_HUB_CACHE', 'the local cache')}. "
                f"Please run `text-generation-server download-weights {model_id}` first."
            )
        files.append(cache_file)

    return files


def download_weights(
    filenames: List[str], model_id: str, revision: Optional[str] = None
) -> List[Path]:
    """Download the safetensors files from the hub"""

    def download_file(fname, tries=5, backoff: int = 5):
        local_file = try_to_load_from_cache(model_id, revision, fname)
        if local_file is not None:
            logger.info(f"File {fname} already present in cache.")
            return Path(local_file)

        for idx in range(tries):
            try:
                logger.info(f"Download file: {fname}")
                stime = time.time()
                local_file = hf_hub_download(
                    filename=fname,
                    repo_id=model_id,
                    revision=revision,
                    local_files_only=HF_HUB_OFFLINE,
                )
                logger.info(
                    f"Downloaded {local_file} in {timedelta(seconds=int(time.time() - stime))}."
                )
                return Path(local_file)
            except Exception as e:
                if idx + 1 == tries:
                    raise e
                logger.error(e)
                logger.info(f"Retrying in {backoff} seconds")
                time.sleep(backoff)
                logger.info(f"Retry {idx + 1}/{tries - 1}")

    # We do this instead of using tqdm because we want to parse the logs with the launcher
    start_time = time.time()
    files = []
    for i, filename in enumerate(filenames):
        file = download_file(filename)

        elapsed = timedelta(seconds=int(time.time() - start_time))
        remaining = len(filenames) - (i + 1)
        eta = (elapsed / (i + 1)) * remaining if remaining > 0 else 0

        logger.info(f"Download: [{i + 1}/{len(filenames)}] -- ETA: {eta}")
        files.append(file)

    return files


================================================
FILE: server/text_generation_server/utils/import_utils.py
================================================
import torch
from loguru import logger
import os


import importlib.util


def is_ipex_available():
    return importlib.util.find_spec("intel_extension_for_pytorch") is not None


def get_cuda_free_memory(device, memory_fraction):
    total_free_memory, _ = torch.cuda.mem_get_info(device)
    total_gpu_memory = torch.cuda.get_device_properties(device).total_memory
    free_memory = max(0, total_free_memory - (1 - memory_fraction) * total_gpu_memory)
    return free_memory


def get_xpu_free_memory(device, memory_fraction):
    total_free_memory, total_xpu_memory = torch.xpu.mem_get_info(device)
    memory_fraction = float(os.getenv("XPU_MEMORY_FRACTION", "0.9"))
    free_memory = max(
        0, int(total_free_memory - (1 - memory_fraction) * total_xpu_memory)
    )
    return free_memory


def get_cpu_free_memory(device, memory_fraction):
    import psutil
    from text_generation_server.utils.dist import WORLD_SIZE

    mem = psutil.virtual_memory()
    free_memory = int(mem.available * 0.95 / WORLD_SIZE)
    return free_memory


def noop(*args, **kwargs):
    pass


SYSTEM = None
if torch.version.hip is not None:
    SYSTEM = "rocm"
    empty_cache = torch.cuda.empty_cache
    synchronize = torch.cuda.synchronize
    get_free_memory = get_cuda_free_memory
elif torch.version.cuda is not None and torch.cuda.is_available():
    SYSTEM = "cuda"
    empty_cache = torch.cuda.empty_cache
    synchronize = torch.cuda.synchronize
    get_free_memory = get_cuda_free_memory
elif is_ipex_available():
    SYSTEM = "ipex"
    import intel_extension_for_pytorch  # noqa: F401

    if hasattr(torch, "xpu") and torch.xpu.is_available():
        empty_cache = torch.xpu.empty_cache
        synchronize = torch.xpu.synchronize
        get_free_memory = get_xpu_free_memory
    else:
        empty_cache = noop
        synchronize = noop
        get_free_memory = get_cpu_free_memory
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    SYSTEM = "xpu"
    empty_cache = torch.xpu.empty_cache
    synchronize = torch.xpu.synchronize
    get_free_memory = get_xpu_free_memory
else:
    SYSTEM = "cpu"

    empty_cache = noop
    synchronize = noop
    get_free_memory = get_cpu_free_memory
logger.info(f"Detected system {SYSTEM}")


================================================
FILE: server/text_generation_server/utils/kernels.py
================================================
import importlib

from loguru import logger
from kernels import load_kernel as hf_load_kernel

from text_generation_server.utils.log import log_once


def load_kernel(*, module: str, repo_id: str):
    """
    Load a kernel. First try to load it as the given module (e.g. for
    local development), falling back to a locked Hub kernel.
    """
    try:
        m = importlib.import_module(module)
        log_once(logger.info, f"Using local module for `{module}`")
        return m
    except ModuleNotFoundError:
        return hf_load_kernel(repo_id=repo_id)


__all__ = ["load_kernel"]


================================================
FILE: server/text_generation_server/utils/log.py
================================================
from functools import lru_cache
from text_generation_server.utils.dist import RANK


@lru_cache(10)
def log_once(log, msg: str, master=True):
    if master:
        log_master(log, msg)
    else:
        log(msg)


def log_master(log, msg: str):
    if RANK == 0:
        log(msg)


================================================
FILE: server/text_generation_server/utils/logits_process.py
================================================
from functools import lru_cache
import math
import time
import torch
from typing import List, Optional, DefaultDict

from loguru import logger
from typing import Dict
from text_generation_server.pb.generate_pb2 import GrammarType

from outlines.fsm.guide import RegexGuide

from transformers import (
    LogitsProcessor,
    PreTrainedTokenizerBase,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    TypicalLogitsWarper,
)

mempool = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None


class StaticWarper:
    def __init__(
        self,
        temperature=1.0,
        top_k=None,
        top_p=None,
        typical_p=None,
    ):
        self.warpers = []

        if temperature is not None and temperature != 1.0:
            temperature = float(temperature)
            self.warpers.append(TemperatureLogitsWarper(temperature))
        if top_k is not None and top_k != 0:
            self.warpers.append(TopKLogitsWarper(top_k=top_k))
        if top_p is not None and top_p < 1.0:
            self.warpers.append(TopPLogitsWarper(top_p=top_p))
        if typical_p is not None and typical_p < 1.0:
            self.warpers.append(TypicalLogitsWarper(mass=typical_p))

        self.cuda_graph = None
        self.static_scores = None
        self.static_warped_scores = None
        self.static_next_logprob = None

    def __call__(self, scores):
        if torch.cuda.is_available():
            if self.cuda_graph is None:
                self.static_scores = scores
                self.cuda_graph = torch.cuda.CUDAGraph()

                with torch.cuda.graph(self.cuda_graph, pool=mempool):
                    local_scores = self.static_scores
                    for warper in self.warpers:
                        local_scores = warper(None, local_scores)

                    self.static_warped_scores = local_scores
                    # Compute logprobs
                    self.static_next_logprob = torch.log_softmax(
                        self.static_warped_scores, -1
                    )

            self.static_scores.copy_(scores)
            self.cuda_graph.replay()

            return self.static_warped_scores, self.static_next_logprob

        # CPU branch
        for warper in self.warpers:
            scores = warper(None, scores)
        return scores, torch.log_softmax(scores, -1)


@lru_cache(10)
def static_warper(
    temperature: Optional[float],
    top_k: Optional[int],
    top_p: Optional[float],
    typical_p: Optional[float],
) -> StaticWarper:
    return StaticWarper(
        temperature=temperature, top_k=top_k, top_p=top_p, typical_p=typical_p
    )


class HeterogeneousRepetitionPenaltyLogitsProcessor(LogitsProcessor):
    r"""
    [`LogitsProcessor`] enforcing an exponential penalty on repeated sequences.
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        repetition_penalty (`List[float]`):
            The parameter for repetition penalty. 1.0 means no penalty. See [this
            paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    """

    def __init__(self, penalty: List[float], dtype: torch.dtype, device: torch.device):
        self.penalty = penalty
        self.penalty_tensor = torch.tensor(
            penalty, dtype=dtype, device=device
        ).unsqueeze(1)

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        score = torch.gather(scores, 1, input_ids)

        # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
        score = torch.where(
            score < 0, score * self.penalty_tensor, score / self.penalty_tensor
        )

        scores.scatter_(1, input_ids, score)
        return scores

    def filter(self, indices):
        self.penalty = [self.penalty[i] for i in indices]
        if any([x != 1.0 for x in self.penalty]):
            self.penalty_tensor = self.penalty_tensor[indices]
            return self
        return None


class FrequencyPenaltyLogitsProcessor(LogitsProcessor):
    r"""
    Frequency penalty as defined by OpenAI

    Args:
        penalty (`float`):
            The parameter for frequency penalty. 0.0 means no penalty.
    """

    def __init__(self, penalty: float):
        self.penalty = penalty

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        score = torch.gather(scores, 1, input_ids)
        # if score < 0 then penalty has to be multiplied to reduce the previous token probability
        score = -torch.where(score < 0, score * self.penalty, score / self.penalty)
        # set score to 0 where input_ids is a padding token
        score *= input_ids.ne(0)

        return scores.scatter_add_(1, input_ids, score)


class HeterogeneousFrequencyPenaltyLogitsProcessor(LogitsProcessor):
    r"""
    Frequency penalty as defined by OpenAI in
    https://platform.openai.com/docs/guides/text-generation/parameter-details

    Args:
        frequency_penalty (`List[float]`):
            The parameter for frequency penalty. 0.0 means no penalty.
    """

    def __init__(self, penalty: List[float], dtype: torch.dtype, device: torch.device):
        self.penalty = penalty
        self.penalty_tensor = torch.tensor(
            penalty, dtype=dtype, device=device
        ).unsqueeze(1)

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        batch_size, input_size = input_ids.size()
        vocab_size = scores.size(1)

        # Calculate the frequency for each token so far
        token_freq = torch.zeros(batch_size, vocab_size, device=input_ids.device)
        token_freq.scatter_add_(
            1, input_ids, torch.ones_like(input_ids, dtype=torch.float)
        )
        token_freq /= input_size

        # Apply the frequency penalty to logits
        scores -= token_freq * self.penalty_tensor
        return scores

    def filter(self, indices):
        self.penalty = [self.penalty[i] for i in indices]
        if any([x != 0.0 for x in self.penalty]):
            self.penalty_tensor = self.penalty_tensor[indices]
            return self
        return None


class HeterogeneousTemperatureLogitsWarper:
    r"""
    [`LogitsWarper`] for temperature (exponential scaling output probability distribution).
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        temperature (`float`):
            The value used to module the logits distribution.
    """

    def __init__(
        self, temperature: List[float], dtype: torch.dtype, device: torch.device
    ):
        self.temperature = temperature
        self.temperature_tensor = torch.tensor(
            temperature, dtype=dtype, device=device
        ).unsqueeze(1)

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        scores.div_(self.temperature_tensor)
        return scores

    def filter(self, indices):
        self.temperature = [self.temperature[i] for i in indices]
        if any([x != 1.0 for x in self.temperature]):
            self.temperature_tensor = self.temperature_tensor[indices]
            return self
        return None


class HeterogeneousTopPLogitsWarper(LogitsProcessor):
    """
    [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        top_p (`float`):
            If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
            higher are kept for generation.
        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
            All filtered values will be set to this float value.
        min_tokens_to_keep (`int`, *optional*, defaults to 1):
            Minimum number of tokens that cannot be filtered.
    """

    def __init__(
        self,
        top_p: List[float],
        dtype: torch.dtype,
        device: torch.device,
        filter_value: float = -math.inf,
        min_tokens_to_keep: int = 1,
    ):
        self.top_p = top_p
        self.top_p_opposite = 1 - torch.tensor(
            top_p, dtype=dtype, device=device
        ).unsqueeze(1)
        self.filter_value = filter_value
        self.min_tokens_to_keep = min_tokens_to_keep

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        sorted_logits, sorted_indices = torch.sort(scores, descending=False)
        probs = sorted_logits.softmax(dim=-1)
        # This is way faster for some reason
        for i in range(probs.shape[0]):
            probs[i] = probs[i].cumsum(dim=-1)

        # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
        sorted_indices_to_remove = probs <= self.top_p_opposite
        # Keep at least min_tokens_to_keep
        sorted_indices_to_remove[..., -self.min_tokens_to_keep :] = 0

        # scatter sorted tensors to original indexing
        indices_to_remove = sorted_indices_to_remove.scatter(
            1, sorted_indices, sorted_indices_to_remove
        )
        warped_scores = scores.masked_fill_(indices_to_remove, self.filter_value)

        return warped_scores

    def filter(self, indices):
        self.top_p = [self.top_p[i] for i in indices]
        if any([x < 1.0 for x in self.top_p]):
            self.top_p_opposite = self.top_p_opposite[indices]
            return self
        return None


class HeterogeneousTopKLogitsWarper(LogitsProcessor):
    r"""
    [`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        top_k (`int`):
            The number of highest probability vocabulary tokens to keep for top-k-filtering.
        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
            All filtered values will be set to this float value.
        min_tokens_to_keep (`int`, *optional*, defaults to 1):
            Minimum number of tokens that cannot be filtered.
    """

    def __init__(
        self,
        top_k: List[int],
        device: torch.device,
        filter_value: float = -math.inf,
        min_tokens_to_keep: int = 1,
    ):
        self.top_k = top_k
        self.max_top_k = max(top_k)
        # value - 1 as we will use top_k to index and python uses 0 based numbering
        self.top_k_tensor = torch.tensor(
            [max(x - 1, min_tokens_to_keep - 1) for x in top_k],
            dtype=torch.int64,
            device=device,
        ).unsqueeze(1)

        # 0 is a special value that disables top_k warping for this member of the batch
        disabled = [x == 0 for x in top_k]

        if any(disabled):
            self.top_k_disabled_mask = torch.tensor(
                disabled, dtype=torch.bool, device=device
            ).view(-1, 1)
        else:
            self.top_k_disabled_mask = None

        self.filter_value = filter_value

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        # If max_top_k is superior to the vocab, we need to clamp or the warper will fail
        if scores.size(-1) < self.max_top_k:
            max_top_k = scores.size(-1)
            top_k = torch.clamp_max(self.top_k_tensor, max_top_k)
        else:
            max_top_k = self.max_top_k
            top_k = self.top_k_tensor

        # Get the kth score for each member of the batch
        kth_scores = torch.gather(torch.topk(scores, max_top_k)[0], 1, top_k)

        # Mask member of kth_scores that do not want to use top_k warping
        if self.top_k_disabled_mask is not None:
            kth_scores.masked_fill_(self.top_k_disabled_mask, self.filter_value)

        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = scores < kth_scores
        scores.masked_fill_(indices_to_remove, self.filter_value)
        return scores

    def filter(self, indices):
        self.top_k = [self.top_k[i] for i in indices]
        disabled = [x == 0 for x in self.top_k]

        if not all(disabled):
            self.top_k_tensor = self.top_k_tensor[indices]
            self.max_top_k = max(self.top_k)

            if self.top_k_disabled_mask is not None:
                self.top_k_disabled_mask = (
                    self.top_k_disabled_mask[indices] if any(disabled) else None
                )

            return self
        return None


class HeterogeneousTypicalLogitsWarper(LogitsProcessor):
    r"""
    [`LogitsWarper`] that performs typical decoding. See [Typical Decoding for Natural Language
    Generation](https://arxiv.org/abs/2202.00666) for more information.
    This version allows for a separate value for each sample and runs inplace when possible.
    It doesn't validate inputs.

    Args:
        mass (`float`):
            Value of typical_p between 0 and 1 inclusive, defaults to 0.9.
        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
            All filtered values will be set to this float value.
        min_tokens_to_keep (`int`, *optional*, defaults to 1):
            Minimum number of tokens that cannot be filtered.
    """

    def __init__(
        self,
        mass: List[float],
        dtype: torch.dtype,
        device: torch.device,
        filter_value: float = -math.inf,
        min_tokens_to_keep: int = 1,
    ):
        self.mass = mass
        self.mass_tensor = torch.tensor(mass, dtype=dtype, device=device).unsqueeze(1)

        # 1 is a special value that disables typical_p warping for this member of the batch
        disabled = [x == 1.0 for x in mass]

        if any(disabled):
            self.disabled_mask = torch.tensor(disabled, dtype=torch.bool, device=device)
        else:
            self.disabled_mask = None

        self.filter_value = filter_value
        self.min_tokens_to_keep = min_tokens_to_keep

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        # calculate entropy
        normalized = torch.nn.functional.log_softmax(scores, dim=-1)
        p = torch.exp(normalized)
        ent = -(normalized * p).nansum(-1, keepdim=True)

        # shift and sort
        shifted_scores = torch.abs((-normalized) - ent)
        sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False)
        sorted_logits = scores.gather(-1, sorted_indices)
        probs = sorted_logits.softmax(dim=-1)
        # This is way faster for some reason
        for i in range(probs.shape[0]):
            probs[i] = probs[i].cumsum(dim=-1)

        # Remove tokens with cumulative mass above the threshold
        last_ind = (probs < self.mass_tensor).sum(dim=1)
        last_ind[last_ind < 0] = 0

        if self.disabled_mask is not None:
            last_ind.masked_fill_(self.disabled_mask, scores.shape[-1] - 1)

        sorted_indices_to_remove = sorted_scores > sorted_scores.gather(
            1, last_ind.view(-1, 1)
        )
        if self.min_tokens_to_keep > 1:
            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
        indices_to_remove = sorted_indices_to_remove.scatter(
            1, sorted_indices, sorted_indices_to_remove
        )

        warped_scores = scores.masked_fill_(indices_to_remove, self.filter_value)

        return warped_scores

    def filter(self, indices):
        self.mass = [self.mass[i] for i in indices]
        disabled = [x == 1.0 for x in self.mass]

        if not all(disabled):
            self.mass_tensor = self.mass_tensor[indices]

            if self.disabled_mask is not None:
                self.disabled_mask = (
                    self.disabled_mask[indices] if any(disabled) else None
                )

            return self
        return None


class HeterogeneousProcessorWrapper(LogitsProcessor):
    r"""
    A wrapper for logit warpers or processors without heterogeneous parameter support.
    Args:
        processors (`Dict[int, LogitsProcessor]`):
            A mapping of sample indices to logit warpers or processors, to be run sequentially.
    """

    def __init__(
        self,
        processors: Dict[int, LogitsProcessor],
    ):
        self.processors = processors

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        for i, processor in self.processors.items():
            scores[i : i + 1] = processor(input_ids[i : i + 1], scores[i : i + 1])
        return scores

    def filter(self, indices):
        new_processors = {}
        for i, idx in enumerate(indices):
            if idx in self.processors:
                new_processors[i] = self.processors[idx]

        if new_processors:
            self.processors = new_processors
            return self
        return None


class GrammarLogitProcessor(LogitsProcessor):
    fsm_state: DefaultDict[int, int]
    fsm: RegexGuide

    def __init__(
        self,
        tokenizer: Optional[PreTrainedTokenizerBase],
        device: str,
        grammar: str,
        grammar_type: GrammarType,
    ):
        self.device = device
        self.tokenizer = GrammarLogitProcessor._cached_adapt_tokenizer(tokenizer)
        self.fsm = GrammarLogitProcessor._cached_compile_fsm(
            grammar_type, grammar, self.tokenizer
        )

    def __call__(
        self,
        logits: torch.Tensor,
        fsm_grammar_state: int,
    ):
        if fsm_grammar_state == -1 or self.fsm is None:
            return logits
        allowed_tokens = self.fsm.get_next_instruction(fsm_grammar_state).tokens
        mask = torch.full_like(logits, -math.inf)
        if allowed_tokens is not None:
            mask[:, allowed_tokens] = 0
        biased_scores = logits + mask
        return biased_scores

    def advance(self, next_token_id, fsm_grammar_state):
        return GrammarLogitProcessor._advance(
            next_token_id, fsm_grammar_state, self.fsm
        )

    @staticmethod
    def _advance(next_token_id, fsm_grammar_state, fsm):
        if fsm_grammar_state == -1:
            return fsm_grammar_state
        return fsm.get_next_state(fsm_grammar_state, next_token_id)

    # TODO: move grammar compilation into the router
    @staticmethod
    @lru_cache(maxsize=32, typed=True)
    def _cached_compile_fsm(
        grammar_type: GrammarType,
        schema: str,
        tokenizer: Optional[PreTrainedTokenizerBase],
    ):
        start_time = time.time()
        if grammar_type == GrammarType.GRAMMAR_TYPE_JSON:
            # JSON schema is compiled by the v3 router.
            logger.error(
                "Non-regex grammars must be compiled by the router, grammar won't be enforced"
            )
            # allows everything
            schema = "(.*?)"

        fsm = RegexGuide.from_regex(schema, tokenizer)
        logger.debug(f"Compiled FSM in {time.time() - start_time:.2f}s")
        return fsm

    @staticmethod
    @lru_cache(maxsize=32, typed=True)
    def _cached_adapt_tokenizer(tokenizer):
        """Adapt tokenizer to work with the FSM.

        The API of Outlines tokenizers is slightly different to that of
        `transformers`. In addition we need to handle the missing spaces to
        Llama's tokenizer to be able to compile FSMs for this model.

        """
        start_time = time.time()
        tokenizer.vocabulary = tokenizer.get_vocab()
        tokenizer.special_tokens = set(tokenizer.all_special_tokens)

        def convert_token_to_string(token: str) -> str:
            from transformers.file_utils import SPIECE_UNDERLINE

            string = tokenizer.convert_tokens_to_string([token])

            # A hack to handle missing spaces to HF's Llama tokenizers
            if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
                return " " + string

            return string

        tokenizer.convert_token_to_string = convert_token_to_string
        logger.debug(f"Adapted tokenizer in {time.time() - start_time:.2f}s")
        return tokenizer


class HeterogeneousGrammarLogitProcessor(LogitsProcessor):
    def __init__(self, tokenizer, device, grammars, grammar_types):
        self.device = device
        self.tokenizer = GrammarLogitProcessor._cached_adapt_tokenizer(tokenizer)
        self.fsms = []
        for grammar, grammar_type in zip(grammars, grammar_types):
            if len(grammar) == 0:
                self.fsms.append(None)
                continue
            fsm = GrammarLogitProcessor._cached_compile_fsm(
                grammar_type, grammar, self.tokenizer
            )
            self.fsms.append(fsm)

    def __call__(
        self,
        logits: torch.Tensor,
        fsm_grammar_states: List[int],
    ):
        mask = torch.full_like(logits, -math.inf)
        for i in range(logits.shape[0]):
            fsm = self.fsms[i]
            if fsm_grammar_states[i] == -1 or fsm is None:
                continue
            allowed_tokens = fsm.get_next_instruction(fsm_grammar_states[i]).tokens
            if allowed_tokens is not None:
                mask[i, allowed_tokens] = 0
            logits[i] += mask[i]
        return logits

    def advance_batch(self, next_token_ids, fsm_grammar_states):
        return [
            GrammarLogitProcessor._advance(
                next_token_ids[i], fsm_grammar_states[i], self.fsms[i]
            )
            for i in range(len(next_token_ids))
        ]

    def advance_at_index(self, next_token_id, fsm_grammar_state, index):
        if self.fsms[index] is None:
            return fsm_grammar_state
        return GrammarLogitProcessor._advance(
            next_token_id, fsm_grammar_state, self.fsms[index]
        )

    def filter(self, indices):
        new_fsms = []
        for i in indices:
            new_fsms.append(self.fsms[i])
        self.fsms = new_fsms
        return self


================================================
FILE: server/text_generation_server/utils/merges/strategies.py
================================================
import copy
from abc import ABC
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, List, Tuple, Type, Union
from text_generation_server.utils.merges.utils import (
    calculate_majority_sign_mask,
    disjoint_merge,
    prune,
)
import torch

if TYPE_CHECKING:
    from text_generation_server.adapters.lora import LoraConfig
    from text_generation_server.utils.adapter import ModuleMap


class AdapterParameters:
    def __init__(
        self, adapter_ids, weights, merge_strategy, density, majority_sign_method
    ):
        self.adapter_ids = adapter_ids
        self.weights = weights
        self.merge_strategy = merge_strategy
        self.density = density
        self.majority_sign_method = majority_sign_method


def _apply_weights(
    tensors: Union[torch.Tensor, List[torch.Tensor]], w: torch.Tensor
) -> torch.Tensor:
    if isinstance(tensors, torch.Tensor):
        t = tensors
    else:
        t = torch.stack(tensors, dim=0)

    # element-wise weighting of each task tensor
    # need to unsqueeze weights to match task tensor dimensions
    # for multiplication to apply element-wise
    while len(t.shape) > len(w.shape):
        w = w.unsqueeze(-1)
    return t * w


class MergeStrategy(ABC):
    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        raise NotImplementedError()


class LinearMerge(MergeStrategy):
    def __init__(self, **kwargs):
        pass

    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        weighted_task_tensors = _apply_weights(task_tensors, weights)
        return weighted_task_tensors.sum(dim=0)


class TiesMerge(MergeStrategy):
    def __init__(self, density: float, majority_sign_method: str = "total", **kwargs):
        self.density = density
        self.majority_sign_method = majority_sign_method

    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        # sparsify
        task_tensors = [
            prune(tensor, self.density, method="magnitude") for tensor in task_tensors
        ]
        task_tensors = torch.stack(task_tensors, dim=0)

        # elect sign before applying weights
        majority_sign_mask = calculate_majority_sign_mask(
            task_tensors, method=self.majority_sign_method
        )
        weighted_task_tensors = _apply_weights(task_tensors, weights)

        # disjoint merge
        return disjoint_merge(weighted_task_tensors, majority_sign_mask)


class DareLinearMerge(MergeStrategy):
    def __init__(self, density: float, **kwargs):
        self.density = density

    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        # sparsify
        task_tensors = [
            prune(tensor, self.density, method="random", rescale=True)
            for tensor in task_tensors
        ]
        weighted_task_tensors = _apply_weights(task_tensors, weights)
        return weighted_task_tensors.sum(dim=0)


class DareTiesMerge(MergeStrategy):
    def __init__(self, density: float, majority_sign_method: str = "total", **kwargs):
        self.density = density
        self.majority_sign_method = majority_sign_method

    def merge(
        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
    ) -> torch.Tensor:
        # sparsify
        task_tensors = [
            prune(tensor, self.density, method="random", rescale=True)
            for tensor in task_tensors
        ]
        task_tensors = torch.stack(task_tensors, dim=0)

        # elect sign before applying weights
        majority_sign_mask = calculate_majority_sign_mask(
            task_tensors, method=self.majority_sign_method
        )
        weighted_task_tensors = _apply_weights(task_tensors, weights)

        # disjoint merge
        mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask)
        return mixed_task_tensors


strategy_registry: Dict[str, Type[MergeStrategy]] = {
    "linear": LinearMerge,
    "ties": TiesMerge,
    "dare_linear": DareLinearMerge,
    "dare_ties": DareTiesMerge,
}


def merge_adapters(
    adapters: List[Tuple["ModuleMap", "LoraConfig"]],
    merge_params: AdapterParameters,
) -> Tuple["ModuleMap", "LoraConfig"]:
    # strategy_name = MergeStrategyEnum.Name(merge_params.merge_strategy).lower()
    strategy_name = "linear"

    weights = merge_params.weights
    if not weights:
        weights = torch.ones(len(adapters))
    else:
        weights = torch.tensor(weights)

    merge_config = {
        "density": merge_params.density,
        # "majority_sign_method": MajoritySignMethodEnum.Name(
        #     merge_params.majority_sign_method
        # ).lower(),
        "majority_sign_method": "total",
    }
    merge_strategy = strategy_registry[strategy_name](**merge_config)

    module_maps: Dict[str, Dict[str, Dict[str, List[torch.Tensor]]]] = defaultdict(
        lambda: defaultdict(lambda: defaultdict(list))
    )
    lora_configs = []
    weight_name_to_adapter_idx = defaultdict(list)

    # input is list of (module_map, lora_config) tuples
    # convert into dict[k][param_name] -> list of tensors
    for idx, (module_map, lora_config) in enumerate(adapters):
        for weight_name, data in module_map.items():
            weight_name_to_adapter_idx[weight_name].append(idx)
            for k, (param_data, param_name) in data.items():
                module_maps[weight_name][k][param_name].append(param_data)
        lora_configs.append(lora_config)

    # validate lora configs are compatible
    _validate_lora_configs(lora_configs)

    # merge tensors for each module such that we have a single ModuleMap:
    # dict[k] -> merged tensor
    merged_module_map: "ModuleMap" = defaultdict(dict)
    for weight_name, data in module_maps.items():
        indices = weight_name_to_adapter_idx[weight_name]
        param_weights = weights[indices]
        for k, param_data in data.items():
            for param_name, tensors in param_data.items():
                merged_tensor = merge_strategy.merge(tensors, param_weights)
                merged_module_map[weight_name][k] = (merged_tensor, param_name)

    # merge lora configs
    merged_lora_config = _merge_lora_configs(lora_configs)

    return merged_module_map, merged_lora_config


def _validate_lora_configs(lora_configs: List["LoraConfig"]):
    # check that all configs have the same rank
    ranks = set(lora_config.r for lora_config in lora_configs)
    if len(ranks) > 1:
        raise ValueError(
            f"unable to merge adapters, lora configs have different ranks: {ranks}"
        )

    if all(len(lora_config.target_modules) == 0 for lora_config in lora_configs):
        raise ValueError(
            "unable to merge adapters, lora configs have no target modules"
        )


def _merge_lora_configs(lora_configs: List["LoraConfig"]) -> "LoraConfig":
    merged_lora_config = copy.copy(lora_configs[0])

    # merge target modules as a union operation
    merged_target_modules = sorted(
        set(
            module
            for lora_config in lora_configs
            for module in lora_config.target_modules
        )
    )
    merged_lora_config.target_modules = merged_target_modules

    return merged_lora_config


================================================
FILE: server/text_generation_server/utils/merges/utils.py
================================================
# coding=utf-8
# From: https://github.com/huggingface/peft/pull/1364
# Copyright 2024-present the HuggingFace Inc. team.
# Modifications by Predibase, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Literal

import torch


def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> torch.Tensor:
    """
    Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction
    `density`.

    Args:
    tensor (`torch.Tensor`):The tensor to prune.
    density (`float`):The fraction of values to preserve. Should be in [0,1].
    """
    mask = torch.zeros_like(tensor).reshape(-1)
    k = int(density * tensor.reshape(-1).shape[0])
    top_k = torch.topk(tensor.abs().reshape(-1), k=k, largest=True)
    mask[top_k[1]] = 1
    return tensor * mask.reshape(tensor.shape)


def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor:
    """
    Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction
    `density`.

    Args:
    tensor (`torch.Tensor`):The tensor to prune.
    density (`float`):The fraction of values to preserve. Should be in [0,1].
    rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor.
    """
    mask = torch.bernoulli(torch.full_like(input=tensor, fill_value=density))
    pruned_tensor = tensor * mask
    if rescale:
        torch.div(input=pruned_tensor, other=density)
    return pruned_tensor


def prune(
    tensor: torch.Tensor,
    density: float,
    method: Literal["magnitude", "random"],
    rescale: bool = False,
) -> torch.Tensor:
    """
    Prune the values of task tensors based on the `method`.

    Args:
    tensor (`torch.Tensor`):The tensor to prune.
    density (`float`):The fraction of values to preserve. Should be in [0,1].
    method (`str`):The method to use to prune. Should be one of ["magnitude", "random"].
    rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor.
    """
    if density >= 1:
        return tensor
    elif density < 0:
        raise ValueError("Density should be >= 0, got {density}")
    if method == "magnitude":
        return magnitude_based_pruning(tensor, density)
    elif method == "random":
        return random_pruning(tensor, density, rescale=rescale)
    else:
        raise ValueError(f"Unknown method {method}")


def calculate_majority_sign_mask(
    tensor: torch.Tensor, method: Literal["total", "frequency"] = "total"
):
    """
    Get the mask of the majority sign across the task tensors. Task tensors are stacked on dimension 0.

    Args:
    tensor (`torch.Tensor`):The tensor to get the mask from.
    method (`str`):The method to use to get the mask. Should be one of ["total", "frequency"].
    """

    sign = tensor.sign()
    if method == "total":
        sign_magnitude = (sign * tensor.abs()).sum(dim=0)
    elif method == "frequency":
        sign_magnitude = sign.sum(dim=0)
    else:
        raise RuntimeError(f'Unimplemented mask method "{method}"')
    majority_sign = torch.where(sign_magnitude >= 0, 1, -1)
    return sign == majority_sign


def disjoint_merge(task_tensors, majority_sign_mask):
    mixed_task_tensors = (task_tensors * majority_sign_mask).sum(dim=0)
    num_params_preserved = majority_sign_mask.sum(dim=0)
    return mixed_task_tensors / torch.clamp(num_params_preserved, min=1.0)


================================================
FILE: server/text_generation_server/utils/peft.py
================================================
import os
from typing import Union
from loguru import logger
import torch

from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM


def download_and_unload_peft(model_id, revision, trust_remote_code):
    torch_dtype = torch.float16

    logger.info("Trying to load a Peft model. It might take a while without feedback")
    try:
        model = AutoPeftModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
            low_cpu_mem_usage=True,
        )
    except Exception:
        model = AutoPeftModelForSeq2SeqLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
            low_cpu_mem_usage=True,
        )
    logger.info("Peft model detected.")
    logger.info("Merging the lora weights.")

    base_model_id = model.peft_config["default"].base_model_name_or_path

    model = model.merge_and_unload()

    os.makedirs(model_id, exist_ok=True)
    cache_dir = model_id
    logger.info(f"Saving the newly created merged model to {cache_dir}")
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_id, trust_remote_code=trust_remote_code
    )
    model.save_pretrained(cache_dir, safe_serialization=True)
    model.config.save_pretrained(cache_dir)
    tokenizer.save_pretrained(cache_dir)


def download_peft(
    model_id: Union[str, os.PathLike], revision: str, trust_remote_code: bool
):
    torch_dtype = torch.float16
    try:
        _model = AutoPeftModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
            low_cpu_mem_usage=True,
        )
    except Exception:
        _model = AutoPeftModelForSeq2SeqLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
            low_cpu_mem_usage=True,
        )
    logger.info("Peft model downloaded.")


================================================
FILE: server/text_generation_server/utils/prefill_chunking.py
================================================
from typing import Optional

SUPPORT_CHUNKING: Optional[bool] = None
MAX_PREFILL_TOKENS: Optional[int] = None


def set_support_chunking(support_chunking: bool):
    global SUPPORT_CHUNKING
    SUPPORT_CHUNKING = support_chunking


def get_support_chunking() -> bool:
    global SUPPORT_CHUNKING
    return SUPPORT_CHUNKING


def set_max_prefill_tokens(max_prefill_tokens: int):
    global MAX_PREFILL_TOKENS
    MAX_PREFILL_TOKENS = max_prefill_tokens


def get_max_prefill_tokens() -> int:
    global MAX_PREFILL_TOKENS
    return MAX_PREFILL_TOKENS


================================================
FILE: server/text_generation_server/utils/quantization.py
================================================
import json
import os
from dataclasses import dataclass
from typing import Optional, List

from huggingface_hub import hf_hub_download
from text_generation_server.layers.marlin.gptq import can_use_gptq_marlin
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    WeightsLoader,
)


# TODO: Split this config to have a single config type per quant method
@dataclass
class _QuantizerConfig:
    bits: int
    checkpoint_format: Optional[str]
    desc_act: bool
    groupsize: int
    quant_method: str
    sym: bool
    weight_block_size: Optional[List[int]]
    modules_to_not_convert: List[str]


@dataclass
class _FP8QuantizerConfig:
    activation_scale_ub: float


def _get_config_json(model_id: str, revision: Optional[str], filename: str):
    if os.path.exists(
        os.path.join(
            model_id,
        )
    ):
        filename = os.path.join(model_id, filename)
    else:
        filename = hf_hub_download(model_id, filename=filename, revision=revision)
    with open(filename, "r") as f:
        return json.load(f)


# We should probably do this with Pydantic JSON deserialization,
# but for now we'll stay close to the old _set_gptq_params.
def _get_quantizer_config(model_id, revision):
    bits = 4
    groupsize = -1
    quant_method = "gptq"
    checkpoint_format = None
    sym = False
    desc_act = False
    weight_block_size = None
    modules_to_not_convert = []

    filename = "config.json"
    try:
        data = _get_config_json(model_id, revision, filename)
        # FP8 config
        if data["quantization_config"]["quant_method"] == "fbgemm_fp8":
            return _FP8QuantizerConfig(
                activation_scale_ub=data["quantization_config"]["activation_scale_ub"]
            )
        weight_block_size = data["quantization_config"].get("weight_block_size", None)

        if "zero_point" in data["quantization_config"]:
            sym = not data["quantization_config"]["zero_point"]
            quant_method = "awq"
        elif "sym" in data["quantization_config"]:
            sym = data["quantization_config"]["sym"]

        bits = data["quantization_config"]["bits"]
        groupsize = data["quantization_config"]["group_size"]
        # Order is important here, desc_act is missing on some real models
        quant_method = data["quantization_config"]["quant_method"]
        checkpoint_format = data["quantization_config"].get("checkpoint_format")
        desc_act = data["quantization_config"].get("desc_act", False)
        modules_to_not_convert = data["quantization_config"].get(
            "modules_to_not_convert", []
        )
        if modules_to_not_convert is None:
            modules_to_not_convert = []
    except Exception:
        filename = "quantize_config.json"
        try:
            data = _get_config_json(model_id, revision, filename)
            bits = data["bits"]
            groupsize = data["group_size"]

            if "zero_point" in data:
                sym = not data["zero_point"]
                quant_method = "awq"
            elif "sym" in data:
                sym = data["sym"]

            desc_act = data["desc_act"]
            if "version" in data and data["version"] == "GEMM":
                quant_method = "awq"
        except Exception:
            filename = "quant_config.json"
            try:
                data = _get_config_json(model_id, revision, filename)
                bits = data["w_bit"]
                groupsize = data["q_group_size"]
                desc_act = data["desc_act"]
                if "version" in data and data["version"] == "GEMM":
                    quant_method = "awq"
            except Exception:
                pass

    return _QuantizerConfig(
        bits=bits,
        groupsize=groupsize,
        quant_method=quant_method,
        checkpoint_format=checkpoint_format,
        sym=sym,
        desc_act=desc_act,
        weight_block_size=weight_block_size,
        modules_to_not_convert=modules_to_not_convert,
    )


def get_loader(
    quantize: Optional[str], model_id: str, revision: Optional[str]
) -> WeightsLoader:
    if quantize == "compressed-tensors":
        config = _get_config_json(model_id, revision, "config.json")
        from text_generation_server.layers.compressed_tensors import (
            CompressedTensorsLoader,
        )

        return CompressedTensorsLoader(config)

    quantizer_config = _get_quantizer_config(model_id, revision)
    if quantize in {"awq", "gptq"}:
        from text_generation_server.layers.gptq import GPTQWeightsLoader

        # TODO: improve check once we have one config type per quantize value
        if not isinstance(quantizer_config, _QuantizerConfig):
            raise ValueError(
                f"Quantize is set to `{quantize}` but received a `{quantizer_config.__class__.__name__}` config."
            )

        if can_use_gptq_marlin(
            bits=quantizer_config.bits,
            groupsize=quantizer_config.groupsize,
            quant_method=quantizer_config.quant_method,
            quantize=quantize,
            sym=quantizer_config.sym,
        ):
            from text_generation_server.layers.marlin import GPTQMarlinWeightsLoader

            return GPTQMarlinWeightsLoader(
                bits=quantizer_config.bits,
                desc_act=quantizer_config.desc_act,
                groupsize=quantizer_config.groupsize,
                quant_method=quantizer_config.quant_method,
                quantize=quantize,
                sym=quantizer_config.sym,
            )
        else:
            return GPTQWeightsLoader(
                bits=quantizer_config.bits,
                desc_act=quantizer_config.desc_act,
                groupsize=quantizer_config.groupsize,
                quant_method=quantizer_config.quant_method,
                quantize=quantize,
                sym=quantizer_config.sym,
                modules_to_not_convert=quantizer_config.modules_to_not_convert,
            )
    elif quantize == "bitsandbytes":
        from text_generation_server.layers.bnb import BNBWeight

        return DefaultWeightsLoader(BNBWeight)
    elif quantize == "bitsandbytes-fp4":
        from text_generation_server.layers.bnb import BNBFP4Weight

        return DefaultWeightsLoader(BNBFP4Weight)
    elif quantize == "bitsandbytes-nf4":
        from text_generation_server.layers.bnb import BNBNF4Weight

        return DefaultWeightsLoader(BNBNF4Weight)
    elif quantize == "eetq":
        from text_generation_server.layers.eetq import EETQWeight

        return DefaultWeightsLoader(EETQWeight)
    elif quantize == "exl2":
        from text_generation_server.layers.exl2 import Exl2WeightsLoader

        return Exl2WeightsLoader()
    elif quantize == "marlin":
        from text_generation_server.layers.marlin import MarlinWeightsLoader

        # TODO: improve check once we have one config type per quantize value
        if not isinstance(quantizer_config, _QuantizerConfig):
            raise ValueError(
                f"Quantize is set to `{quantize}` but received a `{quantizer_config.__class__.__name__}` config."
            )

        return MarlinWeightsLoader(
            bits=quantizer_config.bits,
            is_marlin_24=quantizer_config.checkpoint_format == "marlin_24",
        )
    elif quantize == "fp8" or quantize is None:
        from text_generation_server.layers.fp8 import HybridFP8UnquantLoader

        # Since the default for the quantize config is _QuantizerConfig,
        # we need to add this check to not get an attribute error
        activation_scale_ub = None
        weight_block_size = quantizer_config.weight_block_size
        if isinstance(quantizer_config, _FP8QuantizerConfig):
            activation_scale_ub = quantizer_config.activation_scale_ub

        return HybridFP8UnquantLoader(
            activation_scale_ub,
            to_fp8=quantize == "fp8",
            weight_block_size=weight_block_size,
        )
    else:
        raise ValueError(f"Unknown quantization method: {quantize}")


================================================
FILE: server/text_generation_server/utils/segments.py
================================================
# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/utils/segments.py
# License:  Apache License Version 2.0, January 2004

from typing import List, Tuple, Union

import torch
import numpy as np


def find_segments(
    adapter_indices: Union[torch.Tensor, List[int]],
) -> Tuple[List[int], List[int]]:
    if isinstance(adapter_indices, torch.Tensor):
        adapter_indices = adapter_indices.cpu().numpy()
    elif isinstance(adapter_indices, list):
        adapter_indices = np.array(adapter_indices)

    change_mask = np.diff(adapter_indices, prepend=adapter_indices[0] - 1)
    change_indices = np.nonzero(change_mask)[0]

    segments = [0]
    segments.extend(change_indices[1:].tolist())
    segments.append(len(adapter_indices))

    segment_indices = adapter_indices[change_indices].tolist()

    return segments, segment_indices


class SegmentConcatBuilder:
    def __init__(self):
        self.adapter_segment_indices = []
        self.adapter_segment_tensors = []

    def concat(self, adapter_segments: torch.Tensor, segment_indices: List[int]):
        # Update adapter segments
        if self.adapter_segment_tensors:
            # Because we have already processed at least one batch, remove the 0 start index
            # from this batch denoting the beginning of the segment, then offset all segment
            # positions by the value of the last segment in the previous batch to account for
            # the concatenation.
            adapter_segments = (
                adapter_segments[1:] + self.adapter_segment_tensors[-1][-1]
            )

        if (
            self.adapter_segment_indices
            and self.adapter_segment_indices[-1] == segment_indices[0]
        ):
            # If the last segment in the previous batch is the same as the first segment in this batch,
            # then we merge them together into a single segment. In effect, this means removing it from
            # the segment indices of this batch, and extending the segment span by removing the segment
            # end index from the previous batch.
            segment_indices = segment_indices[1:]
            self.adapter_segment_tensors[-1] = self.adapter_segment_tensors[-1][:-1]

        self.adapter_segment_indices.extend(segment_indices)
        self.adapter_segment_tensors.append(adapter_segments)

    def build(self) -> Tuple[torch.Tensor, List[int]]:
        return torch.concat(self.adapter_segment_tensors), self.adapter_segment_indices


================================================
FILE: server/text_generation_server/utils/speculate.py
================================================
SPECULATE = None


def get_speculate() -> int:
    global SPECULATE
    return SPECULATE


def set_speculate(speculate: int):
    global SPECULATE
    SPECULATE = speculate


================================================
FILE: server/text_generation_server/utils/tokens.py
================================================
import re
from typing import List, Optional, Tuple, Set, Union

import torch
from text_generation_server.pb import generate_pb2
from text_generation_server.pb.generate_pb2 import FinishReason, GrammarType
from text_generation_server.utils.logits_process import (
    FrequencyPenaltyLogitsProcessor,
    GrammarLogitProcessor,
    HeterogeneousProcessorWrapper,
    HeterogeneousRepetitionPenaltyLogitsProcessor,
    HeterogeneousFrequencyPenaltyLogitsProcessor,
    HeterogeneousTemperatureLogitsWarper,
    HeterogeneousTopKLogitsWarper,
    HeterogeneousTopPLogitsWarper,
    HeterogeneousTypicalLogitsWarper,
    HeterogeneousGrammarLogitProcessor,
    static_warper,
)
from text_generation_server.utils.watermark import WatermarkLogitsProcessor
from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcessor


class NextTokenChooser:
    def __init__(
        self,
        watermark: bool = False,
        temperature: float = 1.0,
        repetition_penalty: float = 1.0,
        frequency_penalty: float = 0.0,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        typical_p: Optional[float] = None,
        do_sample: bool = False,
        seed: int = 0,
        device: str = "cpu",
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        grammar: str = "",
        grammar_type: GrammarType = GrammarType.GRAMMAR_TYPE_NONE,
        fsm_grammar_state: int = 0,
    ):
        self.watermark_processor = (
            WatermarkLogitsProcessor(device=device) if watermark else None
        )
        self.repetition_processor = (
            RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)
            if repetition_penalty and repetition_penalty != 1.0
            else None
        )
        self.frequency_processor = (
            FrequencyPenaltyLogitsProcessor(penalty=frequency_penalty)
            if frequency_penalty and frequency_penalty != 0.0
            else None
        )
        self.grammar_processor = (
            GrammarLogitProcessor(tokenizer, device, grammar, grammar_type)
            if grammar != ""
            else None
        )
        self.tokenizer = tokenizer

        has_warpers = (
            (temperature is not None and temperature != 1.0)
            or (top_k is not None and top_k != 0)
            or (top_p is not None and top_p < 1.0)
            or (typical_p is not None and typical_p < 1.0)
        )
        if has_warpers:
            self.static_warper = static_warper(
                temperature=temperature, top_k=top_k, top_p=top_p, typical_p=typical_p
            )
        else:
            self.static_warper = None

        sampling = do_sample or has_warpers

        self.choice = Sampling(seed, device) if sampling else Greedy()
        self.fsm_grammar_state = fsm_grammar_state
        self.grammar = grammar

    def __call__(self, input_ids, scores):
        if self.watermark_processor is not None:
            scores = self.watermark_processor(input_ids, scores)
        if self.repetition_processor is not None:
            scores = self.repetition_processor(input_ids, scores)
        if self.frequency_processor is not None:
            scores = self.frequency_processor(input_ids, scores)
        if self.grammar_processor is not None:
            scores = self.grammar_processor(scores, self.fsm_grammar_state)

        if self.static_warper is None:
            next_logprob = torch.log_softmax(scores, -1)
        else:
            scores, next_logprob = self.static_warper(scores)

        next_id = self.choice(scores[-1]).view(1, 1)

        return next_id, next_logprob

    def advance_grammar(self, next_id: int):
        if self.grammar_processor is not None:
            self.fsm_grammar_state = self.grammar_processor.advance(
                next_id, self.fsm_grammar_state
            )
        return self

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.NextTokenChooserParameters,
        device: torch.device,
        tokenizer: PreTrainedTokenizerBase,
    ) -> "NextTokenChooser":
        return NextTokenChooser(
            watermark=pb.watermark,
            temperature=pb.temperature,
            repetition_penalty=pb.repetition_penalty,
            frequency_penalty=pb.frequency_penalty,
            top_k=pb.top_k,
            top_p=pb.top_p,
            typical_p=pb.typical_p,
            do_sample=pb.do_sample,
            seed=pb.seed,
            device=device,
            tokenizer=tokenizer,
            grammar=pb.grammar,
            grammar_type=pb.grammar_type,
        )


class StopSequenceCriteria:
    def __init__(self, stop_sequence: str):
        stop_sequence = re.escape(stop_sequence)
        self.regex = re.compile(f"{stop_sequence}$")

    def __call__(self, output: str) -> bool:
        if self.regex.findall(output):
            return True
        return False


class StoppingCriteria:
    def __init__(
        self,
        eos_token_ids: Optional[Union[Set[int], int]],
        stop_sequence_criterias: List[StopSequenceCriteria],
        max_new_tokens: int = 20,
        ignore_eos_token: bool = False,
    ):
        if eos_token_ids is None:
            eos_token_ids = set()
        elif isinstance(eos_token_ids, int):
            eos_token_ids = set([eos_token_ids])
        elif isinstance(eos_token_ids, set):
            eos_token_ids = eos_token_ids
        else:
            raise RuntimeError(
                f"eos_token_ids is of invalid type {type(eos_token_ids)}, expected int, None or set[int]"
            )
        self.eos_token_ids = eos_token_ids
        self.stop_sequence_criterias = stop_sequence_criterias
        self.max_new_tokens = max_new_tokens
        self.current_tokens = 0
        self.current_output = ""
        self.ignore_eos_token = ignore_eos_token

    def __call__(self, last_token: int, last_output: str) -> Tuple[bool, Optional[str]]:
        self.current_tokens += 1
        if self.current_tokens >= self.max_new_tokens:
            return True, FinishReason.FINISH_REASON_LENGTH

        if isinstance(last_token, torch.Tensor):
            last_token = last_token.item()

        if not self.ignore_eos_token and last_token in self.eos_token_ids:
            return True, FinishReason.FINISH_REASON_EOS_TOKEN

        if self.stop_sequence_criterias:
            self.current_output += last_output
            # There is no need to keep an output that is too long
            if len(self.current_output) > 300:
                # Slice to -200 to avoid doing it all the time
                self.current_output = self.current_output[-200:]
            for stop_sequence_criteria in self.stop_sequence_criterias:
                if stop_sequence_criteria(self.current_output):
                    return True, FinishReason.FINISH_REASON_STOP_SEQUENCE

        return False, None

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.StoppingCriteriaParameters,
        tokenizer: PreTrainedTokenizerBase,
    ) -> "StoppingCriteria":
        stop_sequence_criterias = [
            StopSequenceCriteria(sequence) for sequence in pb.stop_sequences
        ]
        # TODO Hack because eos_token_id cannot be what we want.
        eos_token_id = getattr(tokenizer, "_eos_token_ids", tokenizer.eos_token_id)
        return StoppingCriteria(
            eos_token_id,
            stop_sequence_criterias,
            pb.max_new_tokens,
            pb.ignore_eos_token,
        )


def create_n_gram_speculation(
    input_ids: torch.Tensor,
    next_ids: torch.Tensor,
    accepted_ids: torch.Tensor,
    speculate: int,
    verbose: bool,
):
    # Very trivial approach, find first match in the string.
    # This is much less refined than actual n-gram but seems to work
    # relatively OK in grounded mode and is by far much faster with
    # much less worst case complexity as everything happens on device.
    B = accepted_ids.shape[0]
    device = input_ids.device
    seeds = next_ids[accepted_ids.cumsum(dim=-1) - 1]
    indices = (input_ids == seeds.unsqueeze(-1)).max(dim=1).indices + 1
    all_indices = indices.unsqueeze(-1).expand(B, speculate) + torch.arange(
        speculate, device=device
    )
    all_indices = torch.clamp(all_indices, max=input_ids.shape[1] - 1)

    speculative_ids = input_ids.gather(dim=-1, index=all_indices)
    return speculative_ids


class HeterogeneousNextTokenChooser:
    def __init__(
        self,
        dtype: torch.dtype,
        device: torch.device,
        watermark: List[bool],
        temperature: List[float],
        repetition_penalty: List[float],
        frequency_penalty: List[float],
        top_k: List[int],
        top_p: List[float],
        typical_p: List[float],
        do_sample: List[bool],
        seeds: List[int],
        tokenizer: PreTrainedTokenizerBase,
        grammars: List[str],
        grammar_types: List[int],
        fsm_grammar_states=List[int],
    ):
        warpers = []

        self.watermark_processor = (
            HeterogeneousProcessorWrapper(
                {
                    i: WatermarkLogitsProcessor(device=device)
                    for i, do_watermark in enumerate(watermark)
                    if do_watermark
                }
            )
            if any(watermark)
            else None
        )

        self.repetition_processor = (
            HeterogeneousRepetitionPenaltyLogitsProcessor(
                repetition_penalty, dtype, device
            )
            if any([x != 1.0 for x in repetition_penalty])
            else None
        )

        self.frequency_processor = (
            HeterogeneousFrequencyPenaltyLogitsProcessor(
                frequency_penalty, dtype, device
            )
            if any([x != 0.0 for x in frequency_penalty])
            else None
        )

        self.grammar_processor = (
            HeterogeneousGrammarLogitProcessor(
                tokenizer, device, grammars, grammar_types
            )
            if any([grammar != "" for grammar in grammars])
            else None
        )

        if any(x != 1.0 for x in temperature):
            do_sample = [
                sample or x != 1.0 for x, sample in zip(temperature, do_sample)
            ]
            warpers.append(
                HeterogeneousTemperatureLogitsWarper(temperature, dtype, device)
            )

        if any(x != 0 for x in top_k):
            do_sample = [sample or x != 0 for x, sample in zip(top_k, do_sample)]
            warpers.append(HeterogeneousTopKLogitsWarper(top_k, device))

        if any(x < 1.0 for x in top_p):
            do_sample = [sample or x < 1.0 for x, sample in zip(top_p, do_sample)]
            warpers.append(HeterogeneousTopPLogitsWarper(top_p, dtype, device))

        if any(x < 1.0 for x in typical_p):
            do_sample = [sample or x < 1.0 for x, sample in zip(typical_p, do_sample)]
            warpers.append(HeterogeneousTypicalLogitsWarper(typical_p, dtype, device))

        self.warpers = warpers

        if any(do_sample):
            self.choice = HeterogeneousSampling(do_sample, seeds, device)
        else:
            self.choice = Greedy()

        self.seeds = seeds
        self.do_sample = do_sample
        self.dtype = dtype
        self.device = device
        self.tokenizer = tokenizer
        self.fsm_grammar_states = fsm_grammar_states
        self.grammars = grammars
        self.grammar_types = grammar_types

    def __call__(
        self,
        input_ids: torch.Tensor,
        scores: torch.Tensor,
        speculate: int,
        speculated_ids: Optional[torch.Tensor] = None,
        speculative_scores: Optional[torch.Tensor] = None,
        verbose=False,
    ):
        if speculated_ids is not None:
            B = scores.shape[0] // (speculated_ids.shape[1] + 1)
            S = speculated_ids.shape[1] + 1
            scores = scores.view(B, S, -1)
        else:
            B = scores.shape[0]
            S = 1
            scores = scores.view(B, S, -1)

        next_ids = torch.zeros((B, S), device=scores.device, dtype=torch.long)

        for j in range(S):
            _scores = scores[:, j]
            if self.watermark_processor is not None:
                _scores = self.watermark_processor(input_ids, _scores)
            if self.repetition_processor is not None:
                _scores = self.repetition_processor(input_ids, _scores)
            if self.frequency_processor is not None:
                _scores = self.frequency_processor(input_ids, _scores)
            if self.grammar_processor is not None:
                _scores = self.grammar_processor(_scores, self.fsm_grammar_states)
            for warper in self.warpers:
                _scores = warper(input_ids, _scores)
            _next_ids = self.choice(_scores)
            scores[:, j] = _scores
            next_ids[:, j] = _next_ids
        next_ids = next_ids.view(B * S)
        allscores = scores.view(B * S, -1)
        alllogprobs = torch.log_softmax(allscores, -1)

        if speculated_ids is not None:
            accepted_ids = []
            B = next_ids.shape[0] // (speculated_ids.shape[1] + 1)
            S = speculated_ids.shape[1] + 1
            indices = []
            for i in range(B):
                _next_ids = next_ids[i * S : (i + 1) * S]
                _speculated_ids = speculated_ids[i]
                validate_speculative = _next_ids[:-1] == _speculated_ids
                index = i * S
                accepted = 1
                # First is always valid
                indices.append(index)
                for valid in validate_speculative.tolist():
                    if valid:
                        index += 1
                        accepted += 1
                        indices.append(index)
                    else:
                        break
                accepted_ids.append(accepted)

            accepted_ids = torch.tensor(
                accepted_ids, device=input_ids.device, dtype=input_ids.dtype
            )
            next_ids = next_ids[indices]
            logprobs = alllogprobs[indices]
            indices = torch.arange(B, device=input_ids.device) * S
            if speculative_scores is not None:
                speculative_scores = speculative_scores[indices + accepted_ids - 1]
        else:
            accepted_ids = torch.ones_like(next_ids)
            logprobs = alllogprobs

        next_logprobs = torch.gather(logprobs, 1, next_ids.view(-1, 1)).view(-1)

        if speculate > 0:
            if speculative_scores is not None:
                # Medusa provided some scores
                speculative_ids = Greedy()(speculative_scores)
            else:
                # n-gram
                speculative_ids = create_n_gram_speculation(
                    input_ids, next_ids, accepted_ids, speculate, verbose
                )
        else:
            speculative_ids = None

        return next_ids, next_logprobs, alllogprobs, accepted_ids, speculative_ids

    def advance_grammar(self, next_ids: List[int]):
        if self.grammar_processor is not None:
            other_new_states = self.grammar_processor.advance_batch(
                next_ids, self.fsm_grammar_states
            )
            self.fsm_grammar_states = other_new_states
        return self

    def advance_grammar_single(self, grammar_state_index: int, next_id: int):
        if self.grammar_processor is not None:
            self.fsm_grammar_states[grammar_state_index] = (
                self.grammar_processor.advance_at_index(
                    next_id,
                    self.fsm_grammar_states[grammar_state_index],
                    grammar_state_index,
                )
            )
        return self

    def filter(self, indices):
        if self.watermark_processor is not None:
            self.watermark_processor = self.watermark_processor.filter(indices)

        if self.repetition_processor is not None:
            self.repetition_processor = self.repetition_processor.filter(indices)

        if self.frequency_processor is not None:
            self.frequency_processor = self.frequency_processor.filter(indices)

        if self.grammar_processor is not None:
            self.grammar_processor = self.grammar_processor.filter(indices)

        filtered_warpers = []
        for warper in self.warpers:
            filtered_warper = warper.filter(indices)
            if filtered_warper is not None:
                filtered_warpers.append(filtered_warper)
        self.warpers = filtered_warpers

        self.seeds = [self.seeds[i] for i in indices]
        self.do_sample = [self.do_sample[i] for i in indices]

        new_grammars = []
        new_fsm_grammar_states = []
        new_grammar_types = []
        for i in indices:
            new_grammars.append(self.grammars[i])
            new_fsm_grammar_states.append(self.fsm_grammar_states[i])
            new_grammar_types.append(self.grammar_types[i])

        self.grammars = new_grammars
        self.fsm_grammar_states = new_fsm_grammar_states
        self.grammar_types = new_grammar_types

        if any(self.do_sample):
            self.choice.filter(indices)
        else:
            self.choice = Greedy()

        return self

    @classmethod
    def from_pb(
        cls,
        pb: List[generate_pb2.NextTokenChooserParameters],
        dtype: torch.dtype,
        device: torch.device,
        tokenizer: PreTrainedTokenizerBase,
        fsm_grammar_states: Optional[List[int]] = None,
    ) -> "HeterogeneousNextTokenChooser":
        return HeterogeneousNextTokenChooser(
            watermark=[pb_.watermark for pb_ in pb],
            temperature=[pb_.temperature for pb_ in pb],
            repetition_penalty=[pb_.repetition_penalty for pb_ in pb],
            frequency_penalty=[pb_.frequency_penalty for pb_ in pb],
            top_k=[pb_.top_k for pb_ in pb],
            top_p=[pb_.top_p for pb_ in pb],
            typical_p=[pb_.typical_p for pb_ in pb],
            do_sample=[pb_.do_sample for pb_ in pb],
            seeds=[pb_.seed for pb_ in pb],
            device=device,
            dtype=dtype,
            tokenizer=tokenizer,
            grammars=[pb_.grammar for pb_ in pb],
            grammar_types=[pb_.grammar_type for pb_ in pb],
            fsm_grammar_states=(
                fsm_grammar_states if fsm_grammar_states else [0] * len(pb)
            ),
        )


class Sampling:
    def __init__(self, seed: int, device: str = "cpu"):
        self.generator = torch.Generator(device)
        self.generator.manual_seed(seed)
        self.seed = seed

    def __call__(self, logits):
        probs = torch.nn.functional.softmax(logits, -1)
        # Avoid GPU<->CPU sync done by torch multinomial
        # See: https://github.com/pytorch/pytorch/blob/925a3788ec5c06db62ca732a0e9425a26a00916f/aten/src/ATen/native/Distributions.cpp#L631-L637
        q = torch.empty_like(probs).exponential_(1, generator=self.generator)
        return probs.div_(q).argmax()


class Greedy:
    def __call__(self, logits):
        return logits.argmax(dim=-1)


class HeterogeneousSampling:
    r"""
    Mixed greedy and probabilistic sampling. Compute both and pick the right one for each sample.
    """

    def __init__(self, do_sample: List[bool], seeds: List[int], device: torch.device):
        self.seeds = seeds

        self.greedy_indices = []
        self.sampling_mapping = {}
        for i, (sample, seed) in enumerate(zip(do_sample, seeds)):
            if sample:
                self.sampling_mapping[i] = Sampling(seed, device)
            else:
                self.greedy_indices.append(i)

        self.greedy = Greedy()

    def __call__(self, logits):
        out = torch.empty(logits.shape[0], dtype=torch.int64, device=logits.device)
        if self.greedy_indices:
            # Computing for all indices is faster than slicing
            torch.argmax(logits, -1, out=out)

        for i, sampling in self.sampling_mapping.items():
            out[i] = sampling(logits[i])
        return out

    def filter(self, indices):
        new_greedy_indices = []
        new_sampling_mapping = {}
        for i, idx in enumerate(indices):
            if idx in self.sampling_mapping:
                new_sampling_mapping[i] = self.sampling_mapping[idx]
            else:
                new_greedy_indices.append(i)

        self.greedy_indices = new_greedy_indices
        self.sampling_mapping = new_sampling_mapping
        return self


def batch_top_tokens(
    top_n_tokens: List[int],
    top_n_tokens_tensor: torch.Tensor,
    logprobs: torch.Tensor,
    accepted_ids: torch.Tensor,
) -> Tuple[List[List[List[int]]], List[List[List[float]]]]:
    """Find the top n most likely tokens for a batch of generations.

    When multiple tokens have equal probabilities and they don't all fit, the
    remaining tokens are also returned.
    """
    max_top_n = max(top_n_tokens)
    # Early exit when top_n_tokens is not used
    if max_top_n == 0:
        return [[[]]] * len(top_n_tokens), [[[]]] * len(top_n_tokens)

    batch_size = accepted_ids.shape[0]
    speculate_size = logprobs.shape[0] // batch_size
    top_n_tokens_tensor = top_n_tokens_tensor.repeat_interleave(speculate_size)
    # Ensure top_n doesn't exceed vocab size
    top_n_tokens = [
        min(tok, logprobs.size(-1))
        for tok in top_n_tokens
        for _ in range(speculate_size)
    ]

    # Parallel kthvalue adapted from https://discuss.pytorch.org/t/how-to-efficiently-get-the-k-th-largest-values-in-parallel/160529/2
    # Sorted topk is faster than torch.sort() since we only need a small subset
    sorted_top_k = torch.topk(logprobs, k=max_top_n, dim=-1, sorted=True).values

    nth_highest = torch.gather(
        sorted_top_k, 1, (top_n_tokens_tensor - 1).clip(min=0).unsqueeze(1)
    )
    nth_highest[nth_highest == -float("inf")] = torch.finfo(logprobs.dtype).min

    # Find the new "fuzzy" top n values
    top_n_indices = (logprobs >= nth_highest).nonzero()
    _, top_n_ishes = torch.unique_consecutive(top_n_indices[:, 0], return_counts=True)

    k = 1 if top_n_ishes.numel() == 0 else top_n_ishes.max()
    # Take a new topk for these new max n values
    top_k = torch.topk(logprobs, k=k, dim=1, sorted=True)

    top_n_ishes = top_n_ishes.tolist()
    top_indices = top_k.indices.tolist()
    top_values = top_k.values.tolist()

    batch_top_token_ids = []
    batch_top_token_logprobs = []
    accepted_ids_list = accepted_ids.tolist()
    for i, n_accepted_ids in enumerate(accepted_ids_list):
        start = speculate_size * i
        stop = speculate_size * (i + 1)
        _top_indices = top_indices[start:stop]
        _top_values = top_values[start:stop]
        _top_n_ishes = top_n_ishes[start:stop]
        _top_n_tokens = top_n_tokens[start:stop]

        _top_indices = _top_indices[:n_accepted_ids]
        _top_values = _top_values[:n_accepted_ids]
        _top_n_ishes = _top_n_ishes[:n_accepted_ids]
        _top_n_tokens = _top_n_tokens[:n_accepted_ids]

        row_top_token_ids = []
        row_top_token_logprobs = []

        for idxs, vals, n, req_n in zip(
            _top_indices, _top_values, _top_n_ishes, _top_n_tokens
        ):
            indices = idxs[:n] if req_n > 0 else []
            values = vals[:n] if req_n > 0 else []

            row_top_token_ids.append(indices)
            row_top_token_logprobs.append(values)

        batch_top_token_ids.append(row_top_token_ids)
        batch_top_token_logprobs.append(row_top_token_logprobs)

    return batch_top_token_ids, batch_top_token_logprobs


================================================
FILE: server/text_generation_server/utils/watermark.py
================================================
# coding=utf-8
# Copyright 2023 Authors of "A Watermark for Large Language Models"
# available at https://arxiv.org/abs/2301.10226
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import torch
from transformers import LogitsProcessor
from typing import List, Union

GAMMA = float(os.getenv("WATERMARK_GAMMA", 0.5))
DELTA = float(os.getenv("WATERMARK_DELTA", 2.0))


class WatermarkLogitsProcessor(LogitsProcessor):
    def __init__(
        self,
        gamma: float = GAMMA,
        delta: float = DELTA,
        hash_key: int = 15485863,  # just a large prime number to create a rng seed with sufficient bit width
        device: str = "cpu",
    ):
        # watermarking parameters
        self.gamma = gamma
        self.delta = delta
        self.rng = torch.Generator(device=device)
        self.hash_key = hash_key

    def _seed_rng(self, input_ids: Union[List[int], torch.LongTensor]):
        if isinstance(input_ids, list):
            assert (
                len(input_ids) >= 1
            ), "requires at least a 1 token prefix sequence to seed rng"
            prev_token = input_ids[-1]
        else:
            assert len(input_ids) == 1
            input_ids = input_ids[0]
            assert (
                input_ids.shape[-1] >= 1
            ), "requires at least a 1 token prefix sequence to seed rng"
            prev_token = input_ids[-1].item()
        self.rng.manual_seed(self.hash_key * prev_token)

    def _get_greenlist_ids(
        self,
        input_ids: Union[List[int], torch.LongTensor],
        max_value: int,
        device: torch.device,
    ) -> List[int]:
        # seed the rng using the previous tokens/prefix
        self._seed_rng(input_ids)

        greenlist_size = int(max_value * self.gamma)
        vocab_permutation = torch.randperm(max_value, device=device, generator=self.rng)
        greenlist_ids = vocab_permutation[:greenlist_size]
        return greenlist_ids

    @staticmethod
    def _calc_greenlist_mask(
        scores: torch.FloatTensor, greenlist_token_ids
    ) -> torch.BoolTensor:
        green_tokens_mask = torch.zeros_like(scores)
        green_tokens_mask[-1, greenlist_token_ids] = 1
        final_mask = green_tokens_mask.bool()
        return final_mask

    @staticmethod
    def _bias_greenlist_logits(
        scores: torch.Tensor, greenlist_mask: torch.Tensor, greenlist_bias: float
    ) -> torch.Tensor:
        scores[greenlist_mask] = scores[greenlist_mask] + greenlist_bias
        return scores

    def __call__(
        self, input_ids: Union[List[int], torch.LongTensor], scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        greenlist_ids = self._get_greenlist_ids(
            input_ids, scores.shape[-1], scores.device
        )
        green_tokens_mask = self._calc_greenlist_mask(
            scores=scores, greenlist_token_ids=greenlist_ids
        )

        scores = self._bias_greenlist_logits(
            scores=scores, greenlist_mask=green_tokens_mask, greenlist_bias=self.delta
        )
        return scores


================================================
FILE: server/text_generation_server/utils/weights.py
================================================
import torch

from abc import ABC, abstractmethod
from contextlib import contextmanager
from pathlib import Path
from typing import Dict, List, Optional, Union, Type
from safetensors import safe_open
from dataclasses import dataclass

from text_generation_server.utils.import_utils import SYSTEM


class WeightsLoader(ABC):
    """
    Instances of this type implement higher-level weight loading.

    At a low-level, every weight is stored in the Safetensors format.
    The interpretation of weights may be different however, for instance
    could be packed, quantized weights. Loaders are responsible for
    interpreting the raw tensors, sharding tensors in a manner compatible
    with the format, etc.
    """

    @abstractmethod
    def get_weights(self, weights: "Weights", prefix: str):
        """
        Get weights at the given prefix and apply without tensor paralllism.
        """
        ...

    @abstractmethod
    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        """
        Get the packed weights at the given prefix with column-splitting for
        tensor parallelism. This method should be used when multiple different
        weights are packed into a tensor, for instance, query/key/value
        weights or a gate/up projection.

        The `block_sizes` determines the proportions of the packed tensors.
        The columns are split in equally sized blocks when `block_sizes` is an
        `int`, or in blocks proportional given to the sizes. For instance
        `[2, 1, 1]` will divide an input with dimensionality `1024` in
        `[512, 256, 256]`.
        """
        ...

    def get_weights_col(self, weights: "Weights", prefix: str):
        """
        Get weights at the given prefix and apply column-splitting for tensor
        paralllism.
        """
        return weights.get_multi_weights_col([prefix], 0)

    @abstractmethod
    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        """
        Get the weights at the given prefixes, column-split them for tensor
        parallelim, and then concatenate the weights along the given dimension.
        """
        ...

    @abstractmethod
    def get_weights_row(self, weights: "Weights", prefix: str):
        """
        Get the weights at the given prefix and apply row-splitting for tensor
        parallism.
        """
        ...


class Weight(ABC):
    """Instances of this type implement unquantized/quantized/to-be
    quantized weights."""

    @abstractmethod
    def get_linear(self, bias: torch.Tensor):
        """Create a linear layer from this weight."""
        ...


@dataclass
class UnquantizedWeight(Weight):
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        from text_generation_server.layers.linear import FastLinear, FastLinearROCm

        if SYSTEM == "rocm":
            return FastLinearROCm(self.weight, bias)
        else:
            return FastLinear(self.weight, bias)


class DefaultWeightsLoader(WeightsLoader):
    """Weight loader that loads (unquantized) Torch tensors."""

    def __init__(self, weight_class: Type[UnquantizedWeight]):
        """Create a loader. Weights will be wrapped using the given `weights_class`,
        normally this will be `UnquantizedWeight`, but a quantizer-specific class
        such as `Fp8Weight` can be used to quantize the weights during loading.
        """
        self.weight_class = weight_class

    """
    Loader that uses tensors as-is with the exception of applying sharding
    and/or concatenation.
    """

    def get_weights(self, weights: "Weights", prefix: str):
        return weights.get_tensor(f"{prefix}.weight")

    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        return self.weight_class(
            weights.get_packed_sharded(
                f"{prefix}.weight", dim=0, block_sizes=block_sizes
            ),
        )

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        w = [weights.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
        return self.weight_class(torch.cat(w, dim=dim))

    def get_weights_row(self, weights: "Weights", prefix: str):
        return self.weight_class(
            weights.get_sharded(f"{prefix}.weight", dim=1),
        )


class Weights:
    def __init__(
        self,
        filenames: List[Path],
        device,
        dtype,
        process_group,
        weights_loader: WeightsLoader,
        aliases: Optional[Dict[str, List[str]]] = None,
        prefix: Optional[str] = None,
    ):
        routing = {}
        for filename in filenames:
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename
        if aliases is None:
            aliases = {}
        self.aliases = aliases
        self.routing = routing
        self.device = device
        self.dtype = dtype
        self.process_group = process_group
        self.prefix = prefix
        self.weights_loader = weights_loader
        self._handles = {}

    def _get_handle(self, filename):
        if filename not in self._handles:
            f = safe_open(filename, framework="pytorch")
            self._handles[filename] = f

        return self._handles[filename]

    def get_filename(self, tensor_name: str) -> (str, str):
        names = [tensor_name]
        if self.prefix is not None:
            prefixed = f"{self.prefix}.{tensor_name}"
            names.append(prefixed)
        for name in names:
            filename = self.routing.get(name, None)
            if filename is not None:
                return str(filename), name

            aliases = self.aliases.get(name, [])
            for alias in aliases:
                filename = self.routing.get(alias, None)
                if filename is not None:
                    return str(filename), alias
        raise RuntimeError(f"weight {tensor_name} does not exist")

    def _get_slice(self, tensor_name: str):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        return slice_

    def has_tensor(self, tensor_name: str):
        try:
            self.get_filename(tensor_name)
        except Exception:
            return False
        return True

    def get_shape(self, tensor_name: str):
        return self._get_slice(tensor_name).get_shape()

    def get_tensor(
        self, tensor_name: str, to_device: bool = True, to_dtype: bool = True
    ) -> torch.Tensor:
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        tensor = f.get_tensor(tensor_name)
        # Special case for gptq which shouldn't convert
        # u4 which are disguised as int32. Exl2 uses int16
        # as well. FP8 uses torch.float8_e4m3fn
        if (
            tensor.dtype
            not in [
                torch.float8_e4m3fn,
                torch.int8,
                torch.int16,
                torch.int32,
                torch.int64,
            ]
            and to_dtype
        ):
            tensor = tensor.to(dtype=self.dtype)
        if to_device:
            tensor = tensor.to(device=self.device)
        return tensor

    def get_partial_sharded(
        self, tensor_name: str, dim: int, to_device=True, to_dtype=True
    ):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        world_size = self.process_group.size()
        rank = self.process_group.rank()

        size = slice_.get_shape()[dim]
        block_size = (size + world_size - 1) // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size

        if dim == 0:
            tensor = slice_[start:stop]
        elif dim == 1:
            tensor = slice_[:, start:stop]
        else:
            raise NotImplementedError("Let's make that generic when needed")
        # Special case for gptq which shouldn't convert
        # u4 which are disguised as int32. exl2 uses int16.
        # FP8 uses torch.float8_e4m3fn.
        if (
            tensor.dtype
            not in (torch.float8_e4m3fn, torch.int8, torch.int16, torch.int32)
            and to_dtype
        ):
            tensor = tensor.to(dtype=self.dtype)
        if to_device:
            tensor = tensor.to(device=self.device)
        return tensor

    def get_sharded(self, tensor_name: str, dim: int, to_device=True, to_dtype=True):
        filename, tensor_name = self.get_filename(tensor_name)
        f = self._get_handle(filename)
        slice_ = f.get_slice(tensor_name)
        world_size = self.process_group.size()
        size = slice_.get_shape()[dim]
        assert (
            size % world_size == 0
        ), f"The choosen size {size} is not compatible with sharding on {world_size} shards"
        return self.get_partial_sharded(
            tensor_name, dim, to_device=to_device, to_dtype=to_dtype
        )

    def get_packed_sharded(
        self,
        tensor_name: str,
        dim: int,
        block_sizes: Union[int, List[int]],
        to_dtype=True,
    ) -> torch.Tensor:
        """
        Get a shard from a tensor that packs multiple tensors.

        When a tensor packs multiple tensors (such as QKV or an up
        projection + gate projection), sharding with `get_sharded` is not
        safe since it would not split the packed tensors across shards.

        This method shards a tensor, such that the packed tensors are
        split across shards.

        The columns are split in equally sized blocks when blocks is an `int`, or
        in blocks proportional given to the sizes. For instance `[2, 1, 1]` will
        divide an input with dimensionality `1024` in `[512, 256, 256]`. This is
        convenient for e.g. splitting QKV without knowing the storage details of
        quantized weights.
        """
        slice_ = self._get_slice(tensor_name)
        total_size = slice_.get_shape()[dim]
        block_sizes = _blocks_to_block_sizes(total_size=total_size, blocks=block_sizes)

        world_size = self.process_group.size()
        rank = self.process_group.rank()

        tensors = []
        block_offset = 0
        for block_size in block_sizes:
            assert (
                block_size % world_size == 0
            ), f"Prepacked tensor cannot be sharded across {world_size} shards"
            shard_block_size = block_size // world_size
            start = rank * shard_block_size
            stop = (rank + 1) * shard_block_size
            if dim == 0:
                tensor = slice_[block_offset + start : block_offset + stop]
            elif dim == 1:
                tensor = slice_[:, block_offset + start : block_offset + stop]
            else:
                raise NotImplementedError("Currently only dim=0 or dim=1 is supported")
            tensors.append(tensor)
            block_offset += block_size
        tensor = torch.cat(tensors, dim=dim)
        tensor = tensor.to(device=self.device)

        # Avoid casting quantizer dtypes.
        if (
            tensor.dtype
            not in [
                torch.float8_e4m3fn,
                torch.int8,
                torch.int16,
                torch.int32,
                torch.int64,
            ]
            and to_dtype
        ):
            tensor = tensor.to(dtype=self.dtype)

        return tensor

    def get_weights(self, prefix: str):
        return self.weights_loader.get_weights(self, prefix)

    def get_weights_col_packed_qkv(
        self,
        prefix: str,
        num_heads: int,
        num_key_value_heads: int,
    ):
        return self.get_weights_col_packed(
            prefix, [num_heads, num_key_value_heads, num_key_value_heads]
        )

    def get_weights_col_packed_gate_up(self, prefix: str):
        return self.get_weights_col_packed(prefix, 2)

    def get_weights_col_packed(self, prefix: str, block_sizes: Union[int, List[int]]):
        """
        The columns are split in equally sized blocks when blocks is an `int`, or
        in blocks proportional given to the sizes. For instance `[2, 1, 1]` will
        divide an input with dimensionality `1024` in `[512, 256, 256]`. This is
        convenient for e.g. splitting QKV without knowing the storage details of
        quantized weights.
        """
        return self.weights_loader.get_weights_col_packed(self, prefix, block_sizes)

    def get_weights_col(self, prefix: str):
        return self.weights_loader.get_weights_col(self, prefix)

    def get_multi_weights_col(self, prefixes: List[str], dim: int):
        return self.weights_loader.get_multi_weights_col(self, prefixes, dim)

    def get_tensor_shard(self, var, dim):
        world_size = self.process_group.size()
        rank = self.process_group.rank()
        block_size = var.size()[dim] // world_size
        start = rank * block_size
        stop = (rank + 1) * block_size
        if dim == 0:
            tensor = var[start:stop]
        elif dim == 1:
            tensor = var[:, start:stop]
        else:
            raise NotImplementedError("Let's make that generic when needed")
        tensor = tensor.to(dtype=self.dtype)
        tensor = tensor.to(device=self.device)
        return tensor

    def get_weights_row(self, prefix: str):
        return self.weights_loader.get_weights_row(self, prefix)

    @contextmanager
    def use_loader(self, weights_loader: WeightsLoader):
        """
        This method is a context manager that can be used to use `Weights` with
        a different loader for the duration of the context.
        """

        old_loader = self.weights_loader
        self.weights_loader = weights_loader
        try:
            yield
        finally:
            self.weights_loader = old_loader

    @property
    def loader(self):
        return self.weights_loader


def _blocks_to_block_sizes(total_size: int, blocks: Union[int, List[int]]) -> List[int]:
    """
    Convert block count or proportions to block sizes.

    This function accepts

    - The number of blocks (int), in which case the block size is
      total_size//blocks; or
    - A list of block sizes (List[int]).

    In the latter case, if sum(blocks) < total_size, the ratios between
    the block sizes will be preserved. For instance, if blocks is
    [2, 1, 1] and total_size is 1024, the returned block sizes are
    [512, 256, 256].
    """
    if isinstance(blocks, list):
        total_blocks = sum(blocks)
        assert (
            total_size % total_blocks == 0
        ), f"Cannot split {total_size} in proportional blocks: {blocks}"
        part_size = total_size // total_blocks
        return [part_size * block for block in blocks]
    else:
        assert total_size % blocks == 0, f"Prepacked is not divisible by {blocks}"
        single_size = total_size // blocks
        return [single_size] * blocks


================================================
FILE: tgi-entrypoint.sh
================================================
#!/bin/bash

ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'

source /usr/src/.venv/bin/activate
exec text-generation-launcher $@


================================================
FILE: update_doc.py
================================================
import subprocess
import argparse
import ast
import json
import os

TEMPLATE = """
# Supported Models

Text Generation Inference enables serving optimized models. The following sections list which models (VLMs & LLMs) are supported.

SUPPORTED_MODELS


If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:

```python
# for causal LMs/text-generation models
AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")
# or, for text-to-text generation models
AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")
```

If you wish to serve a supported model that already exists on a local folder, just point to the local folder.

```bash
text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
```
"""


def check_cli(check: bool):
    output = subprocess.check_output(["text-generation-launcher", "--help"]).decode(
        "utf-8"
    )

    wrap_code_blocks_flag = "<!-- WRAP CODE BLOCKS -->"
    final_doc = f"# Text-generation-launcher arguments\n\n{wrap_code_blocks_flag}\n\n"

    lines = output.split("\n")

    header = ""
    block = []
    for line in lines:
        if line.startswith("  -") or line.startswith("      -"):
            rendered_block = "\n".join(block)
            if header:
                final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
            else:
                final_doc += f"```shell\n{rendered_block}\n```\n"
            block = []
            tokens = line.split("<")
            if len(tokens) > 1:
                header = tokens[-1][:-1]
            else:
                header = line.split("--")[-1]
            header = header.upper().replace("-", "_")

        block.append(line)

    rendered_block = "\n".join(block)
    final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
    block = []

    filename = "docs/source/reference/launcher.md"
    if check:
        with open(filename, "r") as f:
            doc = f.read()
            if doc != final_doc:
                tmp = "launcher.md"
                with open(tmp, "w") as g:
                    g.write(final_doc)
                diff = subprocess.run(
                    ["diff", tmp, filename], capture_output=True
                ).stdout.decode("utf-8")
                print(diff)
                raise Exception(
                    "Cli arguments Doc is not up-to-date, run `python update_doc.py` in order to update it"
                )
    else:
        with open(filename, "w") as f:
            f.write(final_doc)


def check_supported_models(check: bool):
    filename = "server/text_generation_server/models/__init__.py"
    with open(filename, "r") as f:
        tree = ast.parse(f.read())

    enum_def = [
        x for x in tree.body if isinstance(x, ast.ClassDef) and x.name == "ModelType"
    ][0]
    _locals = {}
    _globals = {}
    exec(f"import enum\n{ast.unparse(enum_def)}", _globals, _locals)
    ModelType = _locals["ModelType"]
    list_string = ""
    for data in ModelType:
        list_string += f"- [{data.value['name']}]({data.value['url']})"
        if data.value.get("multimodal", None):
            list_string += " (Multimodal)"
        list_string += "\n"

    final_doc = TEMPLATE.replace("SUPPORTED_MODELS", list_string)

    filename = "docs/source/supported_models.md"
    if check:
        with open(filename, "r") as f:
            doc = f.read()
            if doc != final_doc:
                tmp = "supported.md"
                with open(tmp, "w") as g:
                    g.write(final_doc)
                diff = subprocess.run(
                    ["diff", tmp, filename], capture_output=True
                ).stdout.decode("utf-8")
                print(diff)
                raise Exception(
                    "Supported models is not up-to-date, run `python update_doc.py` in order to update it"
                )
    else:
        with open(filename, "w") as f:
            f.write(final_doc)


def get_openapi_schema():
    try:
        output = subprocess.check_output(["text-generation-router", "print-schema"])
        return json.loads(output)
    except subprocess.CalledProcessError as e:
        print(f"Error running text-generation-router print-schema: {e}")
        raise SystemExit(1)
    except json.JSONDecodeError:
        print("Error: Invalid JSON received from text-generation-router print-schema")
        raise SystemExit(1)


def check_openapi(check: bool):
    new_openapi_data = get_openapi_schema()
    filename = "docs/openapi.json"
    tmp_filename = "openapi_tmp.json"

    with open(tmp_filename, "w") as f:
        json.dump(new_openapi_data, f, indent=2)
        f.write("\n")

    if check:
        diff = subprocess.run(
            [
                "diff",
                tmp_filename,
                filename,
            ],
            capture_output=True,
        ).stdout.decode("utf-8")
        os.remove(tmp_filename)

        if diff:
            print(diff)
            raise Exception(
                "OpenAPI documentation is not up-to-date, run `python update_doc.py` in order to update it"
            )

    else:
        os.rename(tmp_filename, filename)
        print("OpenAPI documentation updated.")
    p = subprocess.run(
        [
            "redocly",
            # allow for trailing whitespace since it's not significant
            # and the precommit hook will remove it
            "lint",
            "--skip-rule",
            "security-defined",
            filename,
        ],
        capture_output=True,
    )
    errors = p.stderr.decode("utf-8")
    # The openapi specs fails on `exclusive_minimum` which is expected to be a boolean where
    # utoipa outputs a value instead: https://github.com/juhaku/utoipa/issues/969
    print(errors)
    if p.returncode != 0:
        print(errors)
        raise Exception(
            f"OpenAPI documentation is invalid, `redocly lint {filename}` showed some error:\n {errors}"
        )
    return True


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--check", action="store_true")

    args = parser.parse_args()

    check_cli(args.check)
    check_supported_models(args.check)
    check_openapi(args.check)


if __name__ == "__main__":
    main()